diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 7dd16f856cd..3770189b447 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -54,13 +54,13 @@ case "${IMAGE_NAME}" in executorch-ubuntu-22.04-mediatek-sdk) MEDIATEK_SDK=yes CLANG_VERSION=12 - ANDROID_NDK_VERSION=r27b + ANDROID_NDK_VERSION=r28c ;; executorch-ubuntu-22.04-clang12-android) LINTRUNNER="" CLANG_VERSION=12 # From https://developer.android.com/ndk/downloads - ANDROID_NDK_VERSION=r27b + ANDROID_NDK_VERSION=r28c ;; *) echo "Invalid image name ${IMAGE_NAME}" diff --git a/.ci/docker/ci_commit_pins/optimum-executorch.txt b/.ci/docker/ci_commit_pins/optimum-executorch.txt index ef3282ba6cc..49b079047a3 100644 --- a/.ci/docker/ci_commit_pins/optimum-executorch.txt +++ b/.ci/docker/ci_commit_pins/optimum-executorch.txt @@ -1 +1 @@ -40b02a2dc61bbf901a2df91719f47c98d65368ec +44d8d54e38c0258357d4e92e1fefe21e845947a3 diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index 8c9330d6f2c..aafc7565373 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -4d4abec80f03cd8fdefe1d9cb3a60d3690cd777e +cf9d09490c7f6685ec68d5db3acf2e0d73c54d00 diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index dcd2afa7a13..5527b9b4d6d 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -16,18 +16,21 @@ hypothesis==6.84.2 parameterized==0.9.0 # Doc build requirements, same as https://github.com/pytorch/pytorch/blob/main/.ci/docker/requirements-docs.txt -sphinx==5.3.0 +sphinx==7.2.6 +sphinxcontrib.katex==0.9.10 +breathe==4.36.0 # only if generating C++ +exhale==0.3.7 # only if generating C++ docs +docutils==0.18.1,<0.21 +sphinx-design==0.6.1 +sphinxcontrib-mermaid==1.0.0 +myst-parser==3.0.1 # if want to contribute in markdown +sphinx-gallery==0.14.0 # only if hosting interactive tutorials +sphinx-sitemap==2.7.1 sphinx-reredirects==0.1.4 -sphinx-gallery==0.14.0 
-breathe==4.34.0 -exhale==0.2.3 -docutils==0.16 matplotlib>=3.9.4 +sphinx-copybutton==0.5.2 # PyTorch Theme --e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme -myst-parser==0.18.1 -sphinx_design==0.4.1 -sphinx-copybutton==0.5.0 +-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2 # script unit test requirements yaspin==3.1.0 diff --git a/.ci/scripts/build-qnn-sdk.sh b/.ci/scripts/build-qnn-sdk.sh index 7f34e8afb63..30835cf5085 100755 --- a/.ci/scripts/build-qnn-sdk.sh +++ b/.ci/scripts/build-qnn-sdk.sh @@ -38,6 +38,7 @@ set_up_aot() { -DEXECUTORCH_BUILD_EXTENSION_EXTENSION_LLM=ON \ -DEXECUTORCH_BUILD_EXTENSION_EXTENSION_LLM_RUNNER=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DPYTHON_EXECUTABLE=python3 diff --git a/.ci/scripts/setup-openvino.sh b/.ci/scripts/setup-openvino.sh index ff667619125..587494f46ac 100755 --- a/.ci/scripts/setup-openvino.sh +++ b/.ci/scripts/setup-openvino.sh @@ -10,19 +10,17 @@ set -ex # shellcheck source=/dev/null source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" -git clone https://github.com/openvinotoolkit/openvino.git -cd openvino && git checkout releases/2025/1 -git submodule update --init --recursive -sudo ./install_build_dependencies.sh -mkdir build && cd build -cmake .. -DCMAKE_BUILD_TYPE=Release -DENABLE_PYTHON=ON -make -j$(nproc) +# Download and install OpenVINO from release packages +OPENVINO_VERSION="2025.3" +OPENVINO_BUILD="2025.3.0.19807.44526285f24" +OPENVINO_URL="https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION}/linux/openvino_toolkit_ubuntu22_${OPENVINO_BUILD}_x86_64.tgz" -cd .. 
-cmake --install build --prefix dist +curl -Lo /tmp/openvino_toolkit.tgz --retry 3 --fail ${OPENVINO_URL} +tar -xzf /tmp/openvino_toolkit.tgz +mv openvino_toolkit_ubuntu22_${OPENVINO_BUILD}_x86_64 openvino -source dist/setupvars.sh -cd ../backends/openvino +source openvino/setupvars.sh +cd backends/openvino pip install -r requirements.txt cd scripts ./openvino_build.sh --enable_python diff --git a/.ci/scripts/setup-samsung-linux-deps.sh b/.ci/scripts/setup-samsung-linux-deps.sh index ed704b2bfbd..c1f2912713b 100644 --- a/.ci/scripts/setup-samsung-linux-deps.sh +++ b/.ci/scripts/setup-samsung-linux-deps.sh @@ -11,9 +11,9 @@ set -ex download_ai_lite_core() { API_BASE="https://soc-developer.semiconductor.samsung.com/api/v1/resource/ai-litecore/download" - API_KEY="kn10SoSY3hkC-9Qny5TqD2mnqVrlupv3krnjLeBt5cY" + API_KEY=$SAMSUNG_AI_LITECORE_KEY - VERSION="0.5" + VERSION="0.7" OS_NAME="Ubuntu 22.04" OUT_FILE="/tmp/exynos-ai-litecore-v${VERSION}.tar.gz" TARGET_PATH="/tmp/exynos_ai_lite_core" @@ -52,7 +52,7 @@ download_ai_lite_core() { install_enn_backend() { NDK_INSTALLATION_DIR=/opt/ndk rm -rf "${NDK_INSTALLATION_DIR}" && sudo mkdir -p "${NDK_INSTALLATION_DIR}" - ANDROID_NDK_VERSION=r27b + ANDROID_NDK_VERSION=r28c # build Exynos backend export ANDROID_NDK_ROOT=${ANDROID_NDK_ROOT:-/opt/ndk} @@ -62,7 +62,7 @@ install_enn_backend() { export PYTHONPATH=${PYTHONPATH:-}:${EXECUTORCH_ROOT}/.. } -AI_LITE_CORE_VERSION=0.5.0 +AI_LITE_CORE_VERSION=0.7.0 download_ai_lite_core ${AI_LITE_CORE_VERSION} install_enn_backend diff --git a/.ci/scripts/test-cuda-build.sh b/.ci/scripts/test-cuda-build.sh new file mode 100755 index 00000000000..bae7dd6af16 --- /dev/null +++ b/.ci/scripts/test-cuda-build.sh @@ -0,0 +1,95 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +set -exu + +CUDA_VERSION=${1:-"12.6"} + +echo "=== Testing ExecuTorch CUDA ${CUDA_VERSION} Build ===" + +# Function to build and test ExecuTorch with CUDA support +test_executorch_cuda_build() { + local cuda_version=$1 + + echo "Building ExecuTorch with CUDA ${cuda_version} support..." + echo "ExecuTorch will automatically detect CUDA and install appropriate PyTorch wheel" + + # Check available resources before starting + echo "=== System Information ===" + echo "Available memory: $(free -h | grep Mem | awk '{print $2}')" + echo "Available disk space: $(df -h . | tail -1 | awk '{print $4}')" + echo "CPU cores: $(nproc)" + echo "CUDA version check:" + nvcc --version || echo "nvcc not found" + nvidia-smi || echo "nvidia-smi not found" + + # Set CMAKE_ARGS to enable CUDA build - ExecuTorch will handle PyTorch installation automatically + export CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" + + echo "=== Starting ExecuTorch Installation ===" + # Install ExecuTorch with CUDA support with timeout and error handling + timeout 5400 ./install_executorch.sh || { + local exit_code=$? 
+ echo "ERROR: install_executorch.sh failed with exit code: $exit_code" + if [ $exit_code -eq 124 ]; then + echo "ERROR: Installation timed out after 90 minutes" + fi + exit $exit_code + } + + echo "SUCCESS: ExecuTorch CUDA build completed" + + # Verify the installation + echo "=== Verifying ExecuTorch CUDA Installation ===" + + # Test that ExecuTorch was built successfully + python -c " +import executorch +print('SUCCESS: ExecuTorch imported successfully') +" + + # Test CUDA availability and show details + python -c " +try: + import torch + print('INFO: PyTorch version:', torch.__version__) + print('INFO: CUDA available:', torch.cuda.is_available()) + + if torch.cuda.is_available(): + print('SUCCESS: CUDA is available for ExecuTorch') + print('INFO: CUDA version:', torch.version.cuda) + print('INFO: GPU device count:', torch.cuda.device_count()) + print('INFO: Current GPU device:', torch.cuda.current_device()) + print('INFO: GPU device name:', torch.cuda.get_device_name()) + + # Test basic CUDA tensor operation + device = torch.device('cuda') + x = torch.randn(10, 10).to(device) + y = torch.randn(10, 10).to(device) + z = torch.mm(x, y) + print('SUCCESS: CUDA tensor operation completed on device:', z.device) + print('INFO: Result tensor shape:', z.shape) + + print('SUCCESS: ExecuTorch CUDA integration verified') + else: + print('WARNING: CUDA not detected, but ExecuTorch built successfully') + exit(1) +except Exception as e: + print('ERROR: ExecuTorch CUDA test failed:', e) + exit(1) +" + + echo "SUCCESS: ExecuTorch CUDA ${cuda_version} build and verification completed successfully" +} + +# Main execution +echo "Current working directory: $(pwd)" +echo "Directory contents:" +ls -la + +# Run the CUDA build test +test_executorch_cuda_build "${CUDA_VERSION}" diff --git a/.ci/scripts/test_backend_linux.sh b/.ci/scripts/test_backend.sh similarity index 57% rename from .ci/scripts/test_backend_linux.sh rename to .ci/scripts/test_backend.sh index d230860875d..a48cc9ec41a 
100755 --- a/.ci/scripts/test_backend_linux.sh +++ b/.ci/scripts/test_backend.sh @@ -1,6 +1,7 @@ #!/usr/bin/env bash # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. +# Copyright 2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -10,16 +11,26 @@ SUITE=$1 FLOW=$2 ARTIFACT_DIR=$3 -REPORT_FILE="$ARTIFACT_DIR/test-report-$FLOW-$SUITE.csv" +REPORT_FILE="$ARTIFACT_DIR/test-report-$FLOW-$SUITE.json" echo "Running backend test job for suite $SUITE, flow $FLOW." echo "Saving job artifacts to $ARTIFACT_DIR." -# The generic Linux job chooses to use base env, not the one setup by the image eval "$(conda shell.bash hook)" CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" +if [[ "$(uname)" == "Darwin" ]]; then + bash .ci/scripts/setup-conda.sh + eval "$(conda shell.bash hook)" + CONDA_RUN_CMD="${CONDA_RUN} --no-capture-output" + ${CONDA_RUN_CMD} pip install awscli==1.37.21 + IS_MACOS=1 +else + CONDA_RUN_CMD="" + IS_MACOS=0 +fi + export PYTHON_EXECUTABLE=python # CMake options to use, in addition to the defaults. @@ -48,13 +59,23 @@ fi if [[ "$FLOW" == *arm* ]]; then # Setup ARM deps. .ci/scripts/setup-arm-baremetal-tools.sh + source examples/arm/ethos-u-scratch/setup_path.sh + + if [[ "$FLOW" == *ethos_u* ]]; then + # Prepare a test runner binary that can run on the Corstone-3x0 FVPs + backends/arm/scripts/build_executorch.sh + backends/arm/test/setup_testing.sh + fi fi -# We need the runner to test the built library. 
-PYTHON_EXECUTABLE=python CMAKE_ARGS="$EXTRA_BUILD_ARGS" .ci/scripts/setup-linux.sh --build-tool cmake --build-mode Release --editable true
+if [[ $IS_MACOS -eq 1 ]]; then
+  SETUP_SCRIPT=.ci/scripts/setup-macos.sh
+else
+  SETUP_SCRIPT=.ci/scripts/setup-linux.sh
+fi
+CMAKE_ARGS="$EXTRA_BUILD_ARGS" ${CONDA_RUN_CMD} $SETUP_SCRIPT --build-tool cmake --build-mode Release --editable true
 EXIT_CODE=0
-python -m executorch.backends.test.suite.runner $SUITE --flow $FLOW --report "$REPORT_FILE" || EXIT_CODE=$?
-
+${CONDA_RUN_CMD} pytest -c /dev/null -n auto backends/test/suite/$SUITE/ -m flow_$FLOW --json-report --json-report-file="$REPORT_FILE" || EXIT_CODE=$?
 # Generate markdown summary.
-python -m executorch.backends.test.suite.generate_markdown_summary "$REPORT_FILE" > ${GITHUB_STEP_SUMMARY:-"step_summary.md"} --exit-code $EXIT_CODE
+${CONDA_RUN_CMD} python -m executorch.backends.test.suite.generate_markdown_summary_json "$REPORT_FILE" > ${GITHUB_STEP_SUMMARY:-"step_summary.md"} --exit-code $EXIT_CODE
diff --git a/.ci/scripts/test_backend_macos.sh b/.ci/scripts/test_backend_macos.sh
deleted file mode 100755
index c31fd504b03..00000000000
--- a/.ci/scripts/test_backend_macos.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env bash
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-set -eux
-
-SUITE=$1
-FLOW=$2
-ARTIFACT_DIR=$3
-
-REPORT_FILE="$ARTIFACT_DIR/test-report-$FLOW-$SUITE.csv"
-
-echo "Running backend test job for suite $SUITE, flow $FLOW."
-echo "Saving job artifacts to $ARTIFACT_DIR."
- -${CONDA_RUN} --no-capture-output pip install awscli==1.37.21 - -bash .ci/scripts/setup-conda.sh -eval "$(conda shell.bash hook)" - -PYTHON_EXECUTABLE=python -${CONDA_RUN} --no-capture-output .ci/scripts/setup-macos.sh --build-tool cmake --build-mode Release - -EXIT_CODE=0 -${CONDA_RUN} --no-capture-output python -m executorch.backends.test.suite.runner $SUITE --flow $FLOW --report "$REPORT_FILE" || EXIT_CODE=$? - -# Generate markdown summary. -${CONDA_RUN} --no-capture-output python -m executorch.backends.test.suite.generate_markdown_summary "$REPORT_FILE" > ${GITHUB_STEP_SUMMARY:-"step_summary.md"} --exit-code $EXIT_CODE diff --git a/.ci/scripts/test_huggingface_optimum_model.py b/.ci/scripts/test_huggingface_optimum_model.py index 05b25299522..e5d815cfc00 100644 --- a/.ci/scripts/test_huggingface_optimum_model.py +++ b/.ci/scripts/test_huggingface_optimum_model.py @@ -43,7 +43,9 @@ def cli_export(command, model_dir): def check_causal_lm_output_quality( - model_id: str, generated_tokens: List[int], max_perplexity_threshold: float = 100.0 + model_id: str, + generated_tokens: List[int], + max_perplexity_threshold: float = 100.0, ): """ Evaluates the quality of text generated by a causal language model by calculating its perplexity. 
@@ -58,12 +60,24 @@ def check_causal_lm_output_quality( """ logging.info(f"Starting perplexity check with model '{model_id}' ...") # Load model - model = AutoModelForCausalLM.from_pretrained( - model_id, - low_cpu_mem_usage=True, - use_cache=False, - torch_dtype=torch.bfloat16, - ) + cls_name = AutoModelForCausalLM + if "llava" in model_id: + from transformers import LlavaForConditionalGeneration + + cls_name = LlavaForConditionalGeneration + try: + model = cls_name.from_pretrained( + model_id, + low_cpu_mem_usage=True, + use_cache=False, + torch_dtype=torch.bfloat16, + ) + except TypeError: + model = cls_name.from_pretrained( + model_id, + low_cpu_mem_usage=True, + torch_dtype=torch.bfloat16, + ) with torch.no_grad(): outputs = model(input_ids=generated_tokens, labels=generated_tokens) @@ -156,6 +170,86 @@ def test_text_generation(model_id, model_dir, recipe, *, quantize=True, run_only assert check_causal_lm_output_quality(model_id, generated_tokens) is True +def test_llm_with_image_modality( + model_id, model_dir, recipe, *, quantize=True, run_only=False +): + command = [ + "optimum-cli", + "export", + "executorch", + "--model", + model_id, + "--task", + "multimodal-text-to-text", + "--recipe", + recipe, + "--output_dir", + model_dir, + "--use_custom_sdpa", + "--use_custom_kv_cache", + "--qlinear", + "8da4w", + "--qembedding", + "8w", + ] + if not run_only: + cli_export(command, model_dir) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer.save_pretrained(model_dir) + + # input + processor = AutoProcessor.from_pretrained(model_id) + image_url = "https://llava-vl.github.io/static/images/view.jpg" + conversation = [ + { + "role": "system", + "content": [ + { + "type": "text", + "text": "A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.", + } + ], + }, + { + "role": "user", + "content": [ + {"type": "image", "url": image_url}, + { + "type": "text", + "text": "What are the things I should be cautious about when I visit here?", + }, + ], + }, + ] + inputs = processor.apply_chat_template( + conversation, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt", + ) + + from executorch.extension.llm.runner import GenerationConfig, MultimodalRunner + + runner = MultimodalRunner(f"{model_dir}/model.pte", f"{model_dir}/tokenizer.model") + generated_text = runner.generate_text_hf( + inputs, + GenerationConfig(max_new_tokens=128, temperature=0, echo=False), + processor.image_token_id, + ) + print(f"\nGenerated text:\n\t{generated_text}") + # Free memory before loading eager for quality check + del runner + gc.collect() + assert ( + check_causal_lm_output_quality( + model_id, tokenizer.encode(generated_text, return_tensors="pt") + ) + is True + ) + + def test_fill_mask(model_id, model_dir, recipe, *, quantize=True, run_only=False): command = [ "optimum-cli", @@ -353,6 +447,9 @@ def test_vit(model_id, model_dir, recipe, *, quantize=False, run_only=False): required=False, help="When provided, write the pte file to this directory. 
Otherwise, a temporary directory is created for the test.", ) + parser.add_argument( + "--run_only", action="store_true", help="Skip export and only run the test" + ) args = parser.parse_args() _text_generation_mapping = { @@ -384,8 +481,16 @@ def test_vit(model_id, model_dir, recipe, *, quantize=False, run_only=False): "vit": ("google/vit-base-patch16-224", test_vit), } + _multimodal_model_mapping = { + "gemma3-4b": ("google/gemma-3-4b-it", test_llm_with_image_modality), + "llava": ("llava-hf/llava-1.5-7b-hf", test_llm_with_image_modality), + } + model_to_model_id_and_test_function = ( - _text_generation_mapping | _mask_fill_mapping | _misc_model_mapping + _text_generation_mapping + | _mask_fill_mapping + | _misc_model_mapping + | _multimodal_model_mapping ) if args.model not in model_to_model_id_and_test_function: @@ -400,4 +505,5 @@ def test_vit(model_id, model_dir, recipe, *, quantize=False, run_only=False): model_dir=tmp_dir if args.model_dir is None else args.model_dir, recipe=args.recipe, quantize=args.quantize, + run_only=args.run_only, ) diff --git a/.ci/scripts/test_ios_ci.sh b/.ci/scripts/test_ios_ci.sh index a89c2cc5809..46c3f71f021 100755 --- a/.ci/scripts/test_ios_ci.sh +++ b/.ci/scripts/test_ios_ci.sh @@ -36,6 +36,7 @@ say() { say "Cloning the Demo App" +git config --global http.postBuffer 524288000 git clone --depth 1 https://github.com/meta-pytorch/executorch-examples.git say "Installing CoreML Backend Requirements" diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh index 84278e290f6..d9e527e7c78 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -159,6 +159,7 @@ cmake_install_executorch_libraries() { -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \ -DEXECUTORCH_BUILD_QNN="$QNN" \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ -DQNN_SDK_ROOT="$QNN_SDK_ROOT" cmake --build cmake-out -j9 --target install --config "$CMAKE_BUILD_TYPE" } @@ -236,7 +237,7 @@ if [[ "${CUSTOM}" == "ON" ]]; then 
EXPORT_ARGS="${EXPORT_ARGS} model.use_sdpa_with_kv_cache=true" fi if [[ "${QE}" == "ON" ]]; then - EXPORT_ARGS="${EXPORT_ARGS} quantization.embedding_quantize=\"8,1024\"" + EXPORT_ARGS="${EXPORT_ARGS} quantization.embedding_quantize=\"8,768\"" fi if [[ "${MPS}" == "ON" ]]; then EXPORT_ARGS="${EXPORT_ARGS} backend.mps.enabled=true model.enable_dynamic_shape=false debug.verbose=true" diff --git a/.ci/scripts/test_llama_torchao_lowbit.sh b/.ci/scripts/test_llama_torchao_lowbit.sh index 5f472fad63b..a7ded52ccc6 100644 --- a/.ci/scripts/test_llama_torchao_lowbit.sh +++ b/.ci/scripts/test_llama_torchao_lowbit.sh @@ -31,6 +31,7 @@ cmake -DPYTHON_EXECUTABLE=python \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=OFF \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ diff --git a/.ci/scripts/test_llava.sh b/.ci/scripts/test_llava.sh index 3deefe1d5bf..d8cb9596ffc 100644 --- a/.ci/scripts/test_llava.sh +++ b/.ci/scripts/test_llava.sh @@ -38,6 +38,7 @@ EXECUTORCH_COMMON_CMAKE_ARGS=" \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \ -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ @@ -107,7 +108,7 @@ cmake_build_llava_runner_for_android() { # only export the one without custom op for now since it's export_llava() { echo "Starting to export Llava. 
This will take about 6 mins" - $PYTHON_EXECUTABLE -m executorch.examples.models.llava.export_llava --pte-name llava.pte --with-artifacts + $PYTHON_EXECUTABLE -m executorch.examples.models.llava.export_llava --pte-name llava.pte --with-artifacts --max-context-len 768 } # Download a new image diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh index 74eb75c6ddd..34063a23374 100755 --- a/.ci/scripts/test_model.sh +++ b/.ci/scripts/test_model.sh @@ -48,22 +48,33 @@ prepare_artifacts_upload() { fi } + build_cmake_executor_runner() { local backend_string_select="${1:-}" echo "Building executor_runner" rm -rf ${CMAKE_OUTPUT_DIR} mkdir ${CMAKE_OUTPUT_DIR} + # Common options: + COMMON="-DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE" if [[ "$backend_string_select" == "XNNPACK" ]]; then echo "Backend $backend_string_select selected" - (cd ${CMAKE_OUTPUT_DIR} \ - && cmake -DCMAKE_BUILD_TYPE=Release \ + cmake -DCMAKE_BUILD_TYPE=Release \ -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..) + ${COMMON} \ + -B${CMAKE_OUTPUT_DIR} . + cmake --build ${CMAKE_OUTPUT_DIR} -j4 + elif [[ "$backend_string_select" == "CUDA" ]]; then + echo "Backend $backend_string_select selected" + cmake -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_CUDA=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + ${COMMON} \ + -B${CMAKE_OUTPUT_DIR} . cmake --build ${CMAKE_OUTPUT_DIR} -j4 else cmake -DCMAKE_BUILD_TYPE=Debug \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + ${COMMON} \ -B${CMAKE_OUTPUT_DIR} . 
cmake --build ${CMAKE_OUTPUT_DIR} -j4 --config Debug fi @@ -131,13 +142,13 @@ test_model_with_xnnpack() { return 0 fi - # Delegation + # Delegation and test with pybindings if [[ ${WITH_QUANTIZATION} == true ]]; then SUFFIX="q8" - "${PYTHON_EXECUTABLE}" -m examples.xnnpack.aot_compiler --model_name="${MODEL_NAME}" --delegate --quantize + "${PYTHON_EXECUTABLE}" -m examples.xnnpack.aot_compiler --model_name="${MODEL_NAME}" --delegate --quantize --test_after_export else SUFFIX="fp32" - "${PYTHON_EXECUTABLE}" -m examples.xnnpack.aot_compiler --model_name="${MODEL_NAME}" --delegate + "${PYTHON_EXECUTABLE}" -m examples.xnnpack.aot_compiler --model_name="${MODEL_NAME}" --delegate --test_after_export fi OUTPUT_MODEL_PATH="${MODEL_NAME}_xnnpack_${SUFFIX}.pte" @@ -320,6 +331,13 @@ test_model_with_mediatek() { EXPORTED_MODEL=$(find "./${EXPORT_SCRIPT}" -type f -name "*.pte" -print -quit) } +test_model_with_cuda() { + # Export a basic .pte and .ptd, then run the model. + "${PYTHON_EXECUTABLE}" -m examples.cuda.scripts.export --model_name="${MODEL_NAME}" --output_dir "./" + build_cmake_executor_runner "CUDA" + ./${CMAKE_OUTPUT_DIR}/executor_runner --model_path "./${MODEL_NAME}.pte" --data_path "./aoti_cuda_blob.ptd" +} + if [[ "${BACKEND}" == "portable" ]]; then echo "Testing ${MODEL_NAME} with portable kernels..." @@ -372,6 +390,12 @@ elif [[ "${BACKEND}" == "mediatek" ]]; then if [[ $? -eq 0 ]]; then prepare_artifacts_upload fi +elif [[ "${BACKEND}" == "cuda" ]]; then + echo "Testing ${MODEL_NAME} with cuda..." + test_model_with_cuda + if [[ $? 
-eq 0 ]]; then + prepare_artifacts_upload + fi else set +e if [[ "${BACKEND}" == *"quantization"* ]]; then diff --git a/.ci/scripts/test_openvino.sh b/.ci/scripts/test_openvino.sh index 85884a6475b..2bb2115b1ec 100755 --- a/.ci/scripts/test_openvino.sh +++ b/.ci/scripts/test_openvino.sh @@ -10,7 +10,7 @@ set -ex # shellcheck source=/dev/null source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" -source openvino/dist/setupvars.sh +source openvino/setupvars.sh cd backends/openvino/tests python test_runner.py --test_type ops python test_runner.py --test_type models diff --git a/.ci/scripts/test_qnn_static_llama_eval.sh b/.ci/scripts/test_qnn_static_llama_eval.sh new file mode 100644 index 00000000000..4baa28fe591 --- /dev/null +++ b/.ci/scripts/test_qnn_static_llama_eval.sh @@ -0,0 +1,91 @@ +#!/bin/bash +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -euo pipefail + +echo ">>> Script invoked with arguments: $@" + +source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" + +# Download QNN_SDK. If already downloaded, export environment path +source "$(dirname "${BASH_SOURCE[0]}")/../../backends/qualcomm/scripts/install_qnn_sdk.sh" +install_qnn + +export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)" +export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang" +export PYTHONPATH=".." 
+cp schema/program.fbs exir/_serialize/program.fbs
+cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs
+cp -f build-x86/backends/qualcomm/PyQnnManagerAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python
+cp -f build-x86/backends/qualcomm/PyQnnWrapperAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python
+
+if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+
+which "${PYTHON_EXECUTABLE}"
+
+# -------------------------------
+# Parse args
+# -------------------------------
+EXTRA_FLAGS=""
+THRESHOLD=62.0 # default fallback
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --flags)
+      EXTRA_FLAGS="$2"
+      shift 2
+      ;;
+    --threshold)
+      THRESHOLD="$2"
+      shift 2
+      ;;
+    *)
+      echo "Unknown option: $1"
+      exit 1
+      ;;
+  esac
+done
+
+# Config
+PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE:-python3}"
+MODEL="qwen2_5-0_5b"
+MAX_SEQ=1024
+PTQ="16a4w"
+
+# NOTE: EXTRA_FLAGS was parsed via --flags above; "$@" is empty here after shift.
+
+# Run command and capture *both stdout and stderr*
+LOG_FILE="eval_${MODEL}_$(date +%Y%m%d_%H%M%S).log"
+
+echo ">>> Running evaluation with flags: $EXTRA_FLAGS | threshold: $THRESHOLD"
+$PYTHON_EXECUTABLE -m executorch.examples.qualcomm.oss_scripts.llama.eval_llama_qnn \
+  --decoder_model "$MODEL" \
+  --quant_linear_only \
+  --max_seq_length "$MAX_SEQ" \
+  --ptq "$PTQ" \
+  $EXTRA_FLAGS 2>&1 | tee "$LOG_FILE"
+
+# Extract last word_perplexity
+LAST_PERP=$(grep "INFO:root:wikitext:" "$LOG_FILE" | tail -n 1 | sed -E "s/.*'word_perplexity,none': ([0-9.]+).*/\1/")
+
+if [[ -z "$LAST_PERP" ]]; then
+  echo "❌ Could not find word_perplexity in logs!"
+  exit 1
+fi
+
+echo ">>> Last word_perplexity = $LAST_PERP"
+
+# Compare against threshold
+awk -v val="$LAST_PERP" -v thr="$THRESHOLD" 'BEGIN {exit (val > thr)}'
+if [[ $?
-ne 0 ]]; then + echo "❌ Regression detected: word_perplexity ($LAST_PERP) > threshold ($THRESHOLD)" + exit 1 +fi + +echo "✅ Check passed: word_perplexity ($LAST_PERP) <= $THRESHOLD" diff --git a/.ci/scripts/test_torchao_huggingface_checkpoints.sh b/.ci/scripts/test_torchao_huggingface_checkpoints.sh index 3c9ac598f8f..da50d28800a 100644 --- a/.ci/scripts/test_torchao_huggingface_checkpoints.sh +++ b/.ci/scripts/test_torchao_huggingface_checkpoints.sh @@ -5,6 +5,7 @@ set -euxo pipefail # Args / flags # ------------------------- TEST_WITH_RUNNER=0 +USE_TORCHAO_KERNELS=0 MODEL_NAME="" # Parse args @@ -22,10 +23,14 @@ while [[ $# -gt 0 ]]; do --test_with_runner) TEST_WITH_RUNNER=1 ;; + --use_torchao_kernels) + USE_TORCHAO_KERNELS=1 + ;; -h|--help) - echo "Usage: $0 [--test_with_runner]" + echo "Usage: $0 [--test_with_runner] [--use_torchao_kernels]" echo " model_name: qwen3_4b | phi_4_mini" echo " --test_with_runner: build ET + run llama_main to sanity-check the export" + echo " --use_torchao_kernels: use torchao kernels for linear and tied embedding" exit 0 ;; *) @@ -42,6 +47,13 @@ fi MODEL_OUT=model.pte + +# Default to XNNPACK +BACKEND_ARGS="-X --xnnpack-extended-ops" +if [[ "$USE_TORCHAO_KERNELS" -eq 1 ]]; then + BACKEND_ARGS="--use-torchao-kernels" +fi + case "$MODEL_NAME" in qwen3_4b) echo "Running Qwen3-4B export..." 
@@ -58,12 +70,12 @@ case "$MODEL_NAME" in --output_name $MODEL_OUT \ -kv \ --use_sdpa_with_kv_cache \ - -X \ - --xnnpack-extended-ops \ --max_context_length 1024 \ --max_seq_length 1024 \ + --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}' \ + --verbose \ --dtype fp32 \ - --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}' + ${BACKEND_ARGS} ;; phi_4_mini) @@ -81,12 +93,12 @@ case "$MODEL_NAME" in --output_name $MODEL_OUT \ -kv \ --use_sdpa_with_kv_cache \ - -X \ - --xnnpack-extended-ops \ --max_context_length 1024 \ --max_seq_length 1024 \ + --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}' \ + --verbose \ --dtype fp32 \ - --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}' + ${BACKEND_ARGS} ;; *) @@ -104,6 +116,10 @@ if [[ $MODEL_SIZE -gt $EXPECTED_MODEL_SIZE_UPPER_BOUND ]]; then fi # Install ET with CMake +EXECUTORCH_BUILD_KERNELS_TORCHAO="OFF" +if [[ "$USE_TORCHAO_KERNELS" -eq 1 ]]; then + EXECUTORCH_BUILD_KERNELS_TORCHAO="ON" +fi if [[ "$TEST_WITH_RUNNER" -eq 1 ]]; then echo "[runner] Building and testing llama_main ..." cmake -DPYTHON_EXECUTABLE=python \ @@ -113,6 +129,7 @@ if [[ "$TEST_WITH_RUNNER" -eq 1 ]]; then -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ @@ -120,6 +137,7 @@ if [[ "$TEST_WITH_RUNNER" -eq 1 ]]; then -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \ -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \ -DEXECUTORCH_BUILD_KERNELS_LLM=ON \ + -DEXECUTORCH_BUILD_KERNELS_TORCHAO=${EXECUTORCH_BUILD_KERNELS_TORCHAO} \ -Bcmake-out . 
cmake --build cmake-out -j16 --config Release --target install diff --git a/.ci/scripts/test_wheel_package_qnn.sh b/.ci/scripts/test_wheel_package_qnn.sh index 39c52a4a396..4207f0392be 100644 --- a/.ci/scripts/test_wheel_package_qnn.sh +++ b/.ci/scripts/test_wheel_package_qnn.sh @@ -98,7 +98,7 @@ PYTHON_VERSION=$1 # Check wheel does NOT contain qualcomm/sdk # ---------------------------- echo "Checking wheel does not contain qualcomm/sdk..." -SDK_FILES=$(unzip -l "$WHEEL_FILE" | awk '{print $4}' | grep "executorch/backends/qualcomm/sdk" || true) +SDK_FILES=$(unzip -l "$WHEEL_FILE" | awk '{print $4}' | grep -E "executorch/backends/qualcomm/sdk" || true) if [ -n "$SDK_FILES" ]; then echo "ERROR: Wheel package contains unexpected qualcomm/sdk files:" echo "$SDK_FILES" @@ -111,7 +111,7 @@ fi # Check .so files in the wheel # ---------------------------- echo "Checking for .so files inside the wheel..." -WHEEL_SO_FILES=$(unzip -l "$WHEEL_FILE" | awk '{print $4}' | grep "executorch/backends/qualcomm/python" || true) +WHEEL_SO_FILES=$(unzip -l "$WHEEL_FILE" | awk '{print $4}' | grep -E "executorch/backends/qualcomm/python" || true) if [ -z "$WHEEL_SO_FILES" ]; then echo "ERROR: No .so files found in wheel under executorch/backends/qualcomm/python" exit 1 @@ -139,12 +139,35 @@ run_core_tests () { echo "=== [$LABEL] Installing wheel & deps ===" "$PIPBIN" install --upgrade pip "$PIPBIN" install "$WHEEL_FILE" - "$PIPBIN" install torch=="2.9.0.dev20250906" --index-url "https://download.pytorch.org/whl/nightly/cpu" - "$PIPBIN" install --pre torchao --index-url "https://download.pytorch.org/whl/nightly/cpu" + TORCH_VERSION=$( + "$PYBIN" - <<'PY' +import runpy +module_vars = runpy.run_path("torch_pin.py") +print(module_vars["TORCH_VERSION"]) +PY +) + + NIGHTLY_VERSION=$( + "$PYBIN" - <<'PY' +import runpy +module_vars = runpy.run_path("torch_pin.py") +print(module_vars["NIGHTLY_VERSION"]) +PY +) + echo "=== [$LABEL] Install torch==${TORCH_VERSION}.${NIGHTLY_VERSION} ===" + + # 
Install torchao based on the pinned PyTorch version + "$PIPBIN" install torch=="${TORCH_VERSION}.${NIGHTLY_VERSION}" --index-url "https://download.pytorch.org/whl/nightly/cpu" + + # Install torchao based on the pinned commit from third-party/ao submodule + pushd "$REPO_ROOT/third-party/ao" > /dev/null + USE_CPP=0 "$PYBIN" setup.py develop + popd > /dev/null echo "=== [$LABEL] Import smoke tests ===" "$PYBIN" -c "import executorch; print('executorch imported successfully')" "$PYBIN" -c "import executorch.backends.qualcomm; print('executorch.backends.qualcomm imported successfully')" + "$PYBIN" -c "from executorch.export.target_recipes import get_android_recipe; recipe = get_android_recipe('android-arm64-snapdragon-fp16'); print(f'executorch.export.target_recipes imported successfully: {recipe}')" echo "=== [$LABEL] List installed executorch/backends/qualcomm/python ===" local SITE_DIR diff --git a/.ci/scripts/test_yolo12.sh b/.ci/scripts/test_yolo12.sh index e3f20d5f970..594ddbf86ed 100755 --- a/.ci/scripts/test_yolo12.sh +++ b/.ci/scripts/test_yolo12.sh @@ -119,6 +119,8 @@ cmake_install_executorch_libraries() { -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -B"${build_dir}" @@ -131,6 +133,8 @@ cmake_install_executorch_libraries() { -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ diff --git a/.ci/scripts/unittest-buck2.sh b/.ci/scripts/unittest-buck2.sh index 340f7438f02..e78e682faac 100755 --- 
a/.ci/scripts/unittest-buck2.sh +++ b/.ci/scripts/unittest-buck2.sh @@ -15,7 +15,8 @@ buck2 query "//backends/apple/... + //backends/arm: + //backends/arm/debug/... + //backends/arm/_passes/... + //backends/arm/runtime/... + //backends/arm/tosa/... \ + //backends/example/... + \ //backends/mediatek/... + //backends/transforms/... + \ -//backends/xnnpack/... + //configurations/... + //extension/flat_tensor: + \ +//backends/xnnpack/... + //codegen/tools/... + \ +//configurations/... + //extension/flat_tensor: + \ //extension/llm/runner: + //kernels/aten/... + //kernels/optimized/... + \ //kernels/portable/... + //kernels/quantized/... + //kernels/test/... + \ //runtime/... + //schema/... + //test/... + //util/..." @@ -34,7 +35,17 @@ BUILDABLE_KERNELS_PRIM_OPS_TARGETS=$(buck2 query //kernels/prim_ops/... | grep - for op in "build" "test"; do buck2 $op $BUILDABLE_OPTIMIZED_OPS \ //examples/selective_build:select_all_dtype_selective_lib_portable_lib \ + //extension/llm/custom_ops/spinquant/test:fast_hadamard_transform_test \ + //extension/llm/runner/test:test_multimodal_input \ + //extension/llm/runner/test:test_generation_config \ //kernels/portable/... \ $BUILDABLE_KERNELS_PRIM_OPS_TARGETS //runtime/backend/... //runtime/core/... \ //runtime/executor: //runtime/kernel/... //runtime/platform/... done + +# Build only without testing +buck2 build //codegen/tools/... \ + //extension/llm/runner/io_manager:io_manager \ + //extension/llm/modules/... 
\ + //extension/llm/runner:multimodal_runner_lib \ + //extension/llm/runner:text_decoder_runner diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh index f6f6ece786b..f896d3f1d40 100644 --- a/.ci/scripts/utils.sh +++ b/.ci/scripts/utils.sh @@ -125,14 +125,15 @@ build_executorch_runner_cmake() { clean_executorch_install_folders mkdir "${CMAKE_OUTPUT_DIR}" - pushd "${CMAKE_OUTPUT_DIR}" || return if [[ $1 == "Debug" ]]; then CXXFLAGS="-fsanitize=address,undefined" else CXXFLAGS="" fi - CXXFLAGS="$CXXFLAGS" retry cmake -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" -DCMAKE_BUILD_TYPE="${1:-Release}" .. - popd || return + CXXFLAGS="$CXXFLAGS" retry cmake \ + -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" \ + -DCMAKE_BUILD_TYPE="${1:-Release}" \ + -B${CMAKE_OUTPUT_DIR} . if [ "$(uname)" == "Darwin" ]; then CMAKE_JOBS=$(( $(sysctl -n hw.ncpu) - 1 )) diff --git a/.ci/scripts/wheel/test_base.py b/.ci/scripts/wheel/test_base.py index f8a7309a6c2..278e46fe75a 100644 --- a/.ci/scripts/wheel/test_base.py +++ b/.ci/scripts/wheel/test_base.py @@ -41,6 +41,18 @@ class ModelTest: def run_tests(model_tests: List[ModelTest]) -> None: + # Test that we can import the portable_lib module - verifies RPATH is correct + print("Testing portable_lib import...") + try: + from executorch.extension.pybindings._portable_lib import ( # noqa: F401 + _load_for_executorch, + ) + + print("✓ Successfully imported _load_for_executorch from portable_lib") + except ImportError as e: + print(f"✗ Failed to import portable_lib: {e}") + raise + # Why are we doing this envvar shenanigans? Since we build the testers, which # uses buck, we cannot run as root. This is a sneaky of getting around that # test. 
diff --git a/.github/scripts/cherry_pick.py b/.github/scripts/cherry_pick.py index 1239ee030dd..8de5279f51b 100755 --- a/.github/scripts/cherry_pick.py +++ b/.github/scripts/cherry_pick.py @@ -39,7 +39,15 @@ def parse_args() -> Any: ) parser.add_argument( "--classification", - choices=["regression", "critical", "fixnewfeature", "docs", "release"], + choices=[ + "regression", + "critical", + "fixnewfeature", + "docs", + "release", + "examples", + "testci", + ], required=True, help="the cherry pick category", ) diff --git a/.github/scripts/propose_ghstack_orig_pr.py b/.github/scripts/propose_ghstack_orig_pr.py index 53b796adaa3..3abcc6cdcf9 100644 --- a/.github/scripts/propose_ghstack_orig_pr.py +++ b/.github/scripts/propose_ghstack_orig_pr.py @@ -86,6 +86,17 @@ def get_pr_stack_from_number(ref: str, repo: Repository) -> List[int]: return pr_stack +def get_differential_revision(pr, repo: Repository) -> str: + body = repo.get_pull(pr.number).body + matches = re.findall(r"Differential Revision: .*", body) + count = len(matches) + if count == 1: + # If there's more than one Differential Revision, let's just return empty + # so that we can disambiguate manually. + return matches[0] + return "" + + def create_prs_for_orig_branch(pr_stack: List[int], repo: Repository): # For the first PR, we want to merge to `main` branch, and we will update # as we go through the stack @@ -100,6 +111,7 @@ def create_prs_for_orig_branch(pr_stack: List[int], repo: Repository): # The PR we want to create is then "branch_to_merge" <- gh/user/x/orig # gh/user/x/orig is the clean diff between gh/user/x/base <- gh/user/x/head orig_branch_merge_head = pr.base.ref.replace("base", "orig") + differential_revision_text = get_differential_revision(pr, repo) bot_metadata = f"""This PR was created by the merge bot to help merge the original PR into the main branch. 
ghstack PR number: https://github.com/pytorch/executorch/pull/{pr.number} by @{pr.user.login} ^ Please use this as the source of truth for the PR details, comments, and reviews @@ -107,6 +119,7 @@ def create_prs_for_orig_branch(pr_stack: List[int], repo: Repository): ghstack PR head: https://github.com/pytorch/executorch/tree/{pr.head.ref} Merge bot PR base: https://github.com/pytorch/executorch/tree/{orig_branch_merge_base} Merge bot PR head: https://github.com/pytorch/executorch/tree/{orig_branch_merge_head} +{differential_revision_text} @diff-train-skip-merge""" existing_orig_pr = repo.get_pulls( diff --git a/.github/workflows/_android.yml b/.github/workflows/_android.yml index 2449e94b2af..7b67c340350 100644 --- a/.github/workflows/_android.yml +++ b/.github/workflows/_android.yml @@ -48,26 +48,13 @@ jobs: bash examples/models/llama/install_requirements.sh bash ".ci/scripts/test_llama.sh" -model stories110M -build_tool cmake -dtype fp16 -mode portable -upload ${ARTIFACTS_DIR_NAME}/fp32-xnnpack-custom - mkdir -p examples/demo-apps/android/LlamaDemo/app/libs - cp aar-out/executorch.aar examples/demo-apps/android/LlamaDemo/app/libs - pushd examples/demo-apps/android/LlamaDemo - ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew build assembleAndroidTest - popd - - DEMO_APP_DIR="${ARTIFACTS_DIR_NAME}/llm_demo" - # The app directory is named using its build flavor as a suffix. 
- mkdir -p "${DEMO_APP_DIR}" - # Collect the app and its test suite - cp examples/demo-apps/android/LlamaDemo/app/build/outputs/apk/debug/*.apk "${DEMO_APP_DIR}" - cp examples/demo-apps/android/LlamaDemo/app/build/outputs/apk/androidTest/debug/*.apk "${DEMO_APP_DIR}" - # Running Android emulator directly on the runner and not using Docker run-emulator: needs: build-llm-demo # NB: Use metal install for KVM support to run the emulator faster runs-on: linux.24xl.spr-metal env: - ANDROID_NDK_VERSION: r27b + ANDROID_NDK_VERSION: r28c API_LEVEL: 34 steps: - name: Setup SSH (Click me for login details) @@ -103,8 +90,6 @@ jobs: shell: bash run: | set -eux - curl -O https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/llm_demo/app-debug.apk - curl -O https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/llm_demo/app-debug-androidTest.apk curl -O https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/fp32-xnnpack-custom/model.zip curl -o android-test-debug-androidTest.apk https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/library_test_dir/executorch_android-debug-androidTest.apk unzip model.zip diff --git a/.github/workflows/_test_backend.yml b/.github/workflows/_test_backend.yml index 5f41faa8cc7..ec426af8892 100644 --- a/.github/workflows/_test_backend.yml +++ b/.github/workflows/_test_backend.yml @@ -57,7 +57,7 @@ jobs: script: | set -eux - source .ci/scripts/test_backend_linux.sh "${{ matrix.suite }}" "${{ matrix.flow }}" "${RUNNER_ARTIFACT_DIR}" + source .ci/scripts/test_backend.sh "${{ matrix.suite }}" "${{ matrix.flow }}" "${RUNNER_ARTIFACT_DIR}" test-backend-macos: if: ${{ inputs.run-macos }} @@ -81,4 +81,4 @@ jobs: # This is needed to get the prebuilt PyTorch wheel from S3 ${CONDA_RUN} --no-capture-output pip install awscli==1.37.21 - source .ci/scripts/test_backend_macos.sh "${{ matrix.suite }}" "${{ 
matrix.flow }}" "${RUNNER_ARTIFACT_DIR}" + source .ci/scripts/test_backend.sh "${{ matrix.suite }}" "${{ matrix.flow }}" "${RUNNER_ARTIFACT_DIR}" diff --git a/.github/workflows/add-unanswered-to-project.yml b/.github/workflows/add-unanswered-to-project.yml index ba2bc6c8436..8b8114d0c04 100644 --- a/.github/workflows/add-unanswered-to-project.yml +++ b/.github/workflows/add-unanswered-to-project.yml @@ -12,7 +12,7 @@ jobs: - name: Add open issues and open, non-draft PRs to org project (excluding certain authors) uses: actions/github-script@v7 with: - github-token: ${{ secrets.GITHUB_TOKEN }} + github-token: ${{ secrets.ET_EXT_CONTRIB }} script: | const projectId = "PVT_kwDOAUB9vs4A_PUL"; // PyTorch org project 136 const owner = 'pytorch'; @@ -20,20 +20,31 @@ jobs: // List of authors to exclude const excludedAuthors = new Set([ - "nil-is-all", "cbilgin", "KimishPatel", "psiddh", "digantdesai", "SS-JIA", "ahmtox", "mcr229", "shoumikhin", + "nil-is-all", "cbilgin", "kimishpatel", "psiddh", "digantdesai", "SS-JIA", "ahmtox", "mcr229", "shoumikhin", "manuelcandales", "metascroy", "cccclai", "rohansjoshi", "kirklandsign", "abhinaykukkadapu", "JacobSzwejbka", "Conarnar", "lucylq", "larryliu0820", "BujSet", "Gasoonjia", "Juntian777", "guangy10", "jackzhxng", "GregoryComer", "leafs1", "swolchok", "mergennachin", "tarun292", "byjlw", "jathu", "Jack-Khuu", "georgehong", - "zhenyan-zhang-meta", "silverguo", "dbort", "jorgep31415", "huydhn", "mcremon-meta", "trivedivivek", "angelayi", - "helunwencser", "hsharma35", "zhxchen17", "iseeyuan", "svekars", "nathanaelsee", "dulinriley", "jerryzh168", + "zhenyan-zhang-meta", "silverguo", "harishs88ss", "AlannaBurke", "dbort", "huydhn", "mcremon-meta", "trivedivivek", + "angelayi", "helunwencser", "hsharma35", "zhxchen17", "iseeyuan", "svekars", "nathanaelsee", "dulinriley", "jerryzh168", "cmodi-meta", "bigfootjon", "sxu", "ydwu4", "Riandy", "tugsbayasgalan", "bsoyluoglu", "yangw-dev", "YIWENX14", "namanahuja", "yushangdi", "limintang", 
"pianpwk", "viveknayakatmeta", "andreanicastro", "JakeStevens", - "gmagogsfm", "zonglinpeng", "eigen-k", "derekxu", "salilsdesai", "skrtskrtfb", "pssrawat", "r-barnes", "pytorchbot", - "pytorchmergebot", "pytorchupdatebot", "facebook-github-bot", "Erik-Lundell", "zingo", "AdrianLundell", - "oscarandersson8218", "per", "Sebastian-Larsson", "SaoirseARM", "robell", "mansnils", "martinlsm", "freddan80", - "YufengShi-dudu", "tom-arm", "perheld", "Jerry-Ge", "gggekov", "fumchin", "wwwind", "haowhsu-quic", "shewu-quic", - "winskuo-quic", "chunit-quic", "DannyYuyang-quic", "chuntl", "cymbalrush", "DenisVieriu97", "billmguo", - "StrycekSimon", "jirioc", "robert-kalmar", "skywall", "neuropilot-captain" + "gmagogsfm", "zonglinpeng", "eigen-k", "derekxu", "salilsdesai", "skrtskrtfb", "pssrawat", "r-barnes", + "kalpit-meta-1", "Will-MingLun-Li", "KapJI", "piyengar", "j-bahr", "BoyuanFeng", "fgasperij", "DariusHolmgren", + "sammarden-meta", "kushrast", "meta-emilian", "Rittzz", "jeanschmidt", "copyrightly", "mikekgfb", "vmpuri", + "zonglinpengmeta", "maggiemoss", "aorenste", "hoangminhle98", "Solumin", "meyering", "rchen152", + "AishwaryaSivaraman", "migeed-z", "ebgraham", "Esteb37", "nausicaasnow", "Camyll", "ezyang", "huiyujie", + "dltn", "cjhopman", "blackm00n", "agunapal", "SamGondelman", "Ninja91", "ivayloen", "DrJessop", "rodrigos01meta", + "akrieger", "cmt0", "yiming0416", "ethansfng", "ThomasJannaud", "nirvanagth", "marcinkwiatkowski", "3l1", + "omerjerk", "nitish2112", "yipjustin", "ejnguyen", "andrewor14", "phaiting", "mgiordy", "LeeOHzzZ", "adicatana", + "Polyomino", "ezrilow", "navsud", "YifanShenSZ", "RdoubleA", "Olivia-liu", "Abhi-hpp", "Vysarat", "azad-meta", + "pytorchbot", "pytorchmergebot", "pytorchupdatebot", "facebook-github-bot", "app/dependabot", "Erik-Lundell", + "zingo", "AdrianLundell", "oscarandersson8218", "per", "Sebastian-Larsson", "SaoirseARM", "robell", "mansnils", + "martinlsm", "freddan80", "YufengShi-dudu", "tom-arm", "perheld", "Jerry-Ge", 
"gggekov", "fumchin", "wwwind", + "benkli01", "Tessil", "maddun01", "Michiel-Olieslagers", "armwaheed", "agrima1304", "emmakujala", "annietllnd", + "haowhsu-quic", "shewu-quic", "winskuo-quic", "chunit-quic", "DannyYuyang-quic", "chuntl", "thchenqti", + "jethroqti", "cymbalrush", "DenisVieriu97", "billmguo", "StrycekSimon", "jirioc", "robert-kalmar", "skywall", + "MartinPavella", "roman-janik-nxp", "novak-vaclav ", "neuropilot-captain", "dijopaul", "cad-rlc", "cad-audio", + "ynimmaga", "daniil-lyakhov", "emmanuel-ferdman", "cavusmustafa", "Jiseong-oh", "alexdean08" ]); async function addItem(contentId, type, number) { @@ -80,11 +91,10 @@ jobs: owner, repo, state: 'open', - draft: false, } ); for (const pr of prs) { - if (!excludedAuthors.has(pr.user.login)) { + if (!pr.draft && !excludedAuthors.has(pr.user.login)) { await addItem(pr.node_id, 'pr', pr.number); } } diff --git a/.github/workflows/android-release-artifacts.yml b/.github/workflows/android-release-artifacts.yml index f0b74342eb8..beda0f77c83 100644 --- a/.github/workflows/android-release-artifacts.yml +++ b/.github/workflows/android-release-artifacts.yml @@ -15,15 +15,11 @@ on: type: choice options: - "xnnpack" - - "vulkan+xnnpack" + - "vulkan" - "qnn" schedule: - cron: 0 10 * * * -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - jobs: check-if-aar-exists: name: check-if-aar-exists @@ -34,12 +30,13 @@ jobs: shell: bash run: | VERSION="${{ inputs.version }}" + FLAVOR="${{ inputs.flavor }}" if [ -z "$VERSION" ]; then echo "No version name specified. 
Will create a snapshot AAR" exit 0 fi - if curl -I "https://ossci-android.s3.amazonaws.com/executorch/release/${VERSION}/executorch.aar" | grep "200 OK"; then - echo "AAR already exists at https://ossci-android.s3.amazonaws.com/executorch/release/${VERSION}/executorch.aar" + if curl -I "https://ossci-android.s3.amazonaws.com/executorch/release/${VERSION}-${FLAVOR}/executorch.aar" | grep "200 OK"; then + echo "AAR already exists at https://ossci-android.s3.amazonaws.com/executorch/release/${VERSION}-${FLAVOR}/executorch.aar" echo "Will skip build/upload" exit 1 fi @@ -93,7 +90,14 @@ jobs: fi FLAVOR="${{ inputs.flavor }}" - if [[ "$FLAVOR" == "vulkan+xnnpack" || -z "$FLAVOR" ]]; then + if [ ! -z "$FLAVOR" ]; then + GRADLE_ARGS+=" -Dflavor=${FLAVOR}" + fi + + if [[ "$FLAVOR" == "vulkan" || -z "$FLAVOR" ]]; then + curl -O https://sdk.lunarg.com/sdk/download/1.4.321.1/linux/vulkansdk-linux-x86_64-1.4.321.1.tar.xz + tar xf vulkansdk-linux-x86_64-1.4.321.1.tar.xz -C /tmp + export PATH="/tmp/1.4.321.1/x86_64/bin:$PATH" export EXECUTORCH_BUILD_VULKAN=ON fi @@ -145,8 +149,12 @@ jobs: pip install awscli==1.32.18 AWS_CMD="aws s3 cp" VERSION="${{ inputs.version }}" + FLAVOR="${{ inputs.flavor }}" if [ -z "$VERSION" ]; then VERSION="snapshot-$(date +"%Y%m%d")" fi - ${AWS_CMD} executorch.aar s3://ossci-android/executorch/release/${VERSION}/executorch.aar --acl public-read - ${AWS_CMD} executorch.aar.sha256sums s3://ossci-android/executorch/release/${VERSION}/executorch.aar.sha256sums --acl public-read + if [ -z "$FLAVOR" ]; then + FLAVOR="xnnpack" + fi + ${AWS_CMD} executorch.aar s3://ossci-android/executorch/release/${VERSION}-${FLAVOR}/executorch.aar --acl public-read + ${AWS_CMD} executorch.aar.sha256sums s3://ossci-android/executorch/release/${VERSION}-${FLAVOR}/executorch.aar.sha256sums --acl public-read diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml new file mode 100644 index 00000000000..c1b22e692ab --- /dev/null +++ b/.github/workflows/cuda.yml @@ 
-0,0 +1,282 @@ +# Test ExecuTorch CUDA Build Compatibility +# This workflow tests whether ExecuTorch can be successfully built with CUDA support +# across different CUDA versions (12.6, 12.8, 12.9) using the command: +# CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh +# +# Note: ExecuTorch automatically detects the system CUDA version using nvcc and +# installs the appropriate PyTorch wheel. No manual CUDA/PyTorch installation needed. + +name: Test CUDA Builds + +on: + pull_request: + push: + branches: + - main + - release/* + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: false + +jobs: + test-cuda-builds: + strategy: + fail-fast: false + matrix: + cuda-version: ["12.6", "12.8", "13.0"] + + name: test-executorch-cuda-build-${{ matrix.cuda-version }} + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + with: + timeout: 90 + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: ${{ matrix.cuda-version }} + use-custom-docker-registry: false + submodules: recursive + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + set -eux + + # Test ExecuTorch CUDA build - ExecuTorch will automatically detect CUDA version + # and install the appropriate PyTorch wheel when CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" + source .ci/scripts/test-cuda-build.sh "${{ matrix.cuda-version }}" + + # This job will fail if any of the CUDA versions fail + check-all-cuda-builds: + needs: test-cuda-builds + runs-on: ubuntu-latest + if: always() + steps: + - name: Check if all CUDA builds succeeded + run: | + if [[ "${{ needs.test-cuda-builds.result }}" != "success" ]]; then + echo "ERROR: One or more ExecuTorch CUDA builds failed!" 
+ echo "CUDA build results: ${{ needs.test-cuda-builds.result }}" + exit 1 + else + echo "SUCCESS: All ExecuTorch CUDA builds (12.6, 12.8, 12.9) completed successfully!" + fi + + test-models-cuda: + name: test-models-cuda + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + strategy: + fail-fast: false + matrix: + model: [linear, add, add_mul, resnet18] + with: + timeout: 90 + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: 12.6 + use-custom-docker-registry: false + submodules: recursive + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + set -eux + + PYTHON_EXECUTABLE=python CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh + export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH + PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda + + export-voxtral-cuda-artifact: + name: export-voxtral-cuda-artifact + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + secrets: inherit + strategy: + fail-fast: false + with: + timeout: 90 + secrets-env: EXECUTORCH_HF_TOKEN + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: 12.6 + use-custom-docker-registry: false + submodules: recursive + upload-artifact: voxtral-cuda-export + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + set -eux + + echo "::group::Setup ExecuTorch" + CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh + echo "::endgroup::" + + echo "::group::Setup Huggingface" + pip install -U "huggingface_hub[cli]" accelerate + huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) + pip install 
git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} + pip install mistral-common librosa + pip list + echo "::endgroup::" + + echo "::group::Export Voxtral" + optimum-cli export executorch \ + --model "mistralai/Voxtral-Mini-3B-2507" \ + --task "multimodal-text-to-text" \ + --recipe "cuda" \ + --dtype bfloat16 \ + --device cuda \ + --max_seq_len 1024 \ + --output_dir ./ + python -m executorch.extension.audio.mel_spectrogram \ + --feature_size 128 \ + --stack_output \ + --max_audio_len 300 \ + --output_file voxtral_preprocessor.pte + + test -f model.pte + test -f aoti_cuda_blob.ptd + test -f voxtral_preprocessor.pte + echo "::endgroup::" + + echo "::group::Store Voxtral Artifacts" + mkdir -p "${RUNNER_ARTIFACT_DIR}" + cp model.pte "${RUNNER_ARTIFACT_DIR}/" + cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/" + cp voxtral_preprocessor.pte "${RUNNER_ARTIFACT_DIR}/" + ls -al "${RUNNER_ARTIFACT_DIR}" + echo "::endgroup::" + + benchmark-voxtral-cuda: + name: benchmark-voxtral-cuda + needs: export-voxtral-cuda-artifact + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + strategy: + fail-fast: false + with: + timeout: 90 + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: 12.6 + use-custom-docker-registry: false + submodules: recursive + download-artifact: voxtral-cuda-export + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + set -eux + + echo "::group::Setup ExecuTorch Requirements" + CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh + pip list + echo "::endgroup::" + + echo "::group::Prepare Voxtral Artifacts" + cp "${RUNNER_ARTIFACT_DIR}/model.pte" . + cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" . 
+ ls -al model.pte aoti_cuda_blob.ptd + echo "::endgroup::" + + echo "::group::Build Voxtral Benchmark" + cmake -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_CUDA=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ + -DEXECUTORCH_BUILD_TESTS=ON \ + -Bcmake-out . + cmake --build cmake-out -j$(( $(nproc) - 1 )) --target voxtral_runner + echo "::endgroup::" + + echo "::group::Run Voxtral Benchmark" + + export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH + cmake-out/backends/cuda/voxtral_runner model.pte aoti_cuda_blob.ptd + + echo "::endgroup::" + + test-voxtral-cuda-e2e: + name: test-voxtral-cuda-e2e + needs: export-voxtral-cuda-artifact + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + strategy: + fail-fast: false + with: + timeout: 90 + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: 12.6 + use-custom-docker-registry: false + submodules: recursive + download-artifact: voxtral-cuda-export + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + set -eux + + echo "::group::Setup ExecuTorch Requirements" + CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_requirements.sh + pip list + echo "::endgroup::" + + echo "::group::Prepare Voxtral Artifacts" + cp "${RUNNER_ARTIFACT_DIR}/model.pte" . + cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" . + cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" . 
+ TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json" + curl -L $TOKENIZER_URL -o tekken.json + ls -al model.pte aoti_cuda_blob.ptd voxtral_preprocessor.pte tekken.json + echo "::endgroup::" + + echo "::group::Download Test Audio File" + AUDIO_URL="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav" + curl -L $AUDIO_URL -o poem.wav + echo "::endgroup::" + + echo "::group::Build Voxtral Runner" + cmake --preset llm \ + -DEXECUTORCH_BUILD_CUDA=ON \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -Bcmake-out -S. + cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release + + cmake -DEXECUTORCH_BUILD_CUDA=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -Sexamples/models/voxtral \ + -Bcmake-out/examples/models/voxtral/ + cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release + echo "::endgroup::" + + echo "::group::Run Voxtral Runner" + set +e + export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH + OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \ + --model_path model.pte \ + --data_path aoti_cuda_blob.ptd \ + --tokenizer_path tekken.json \ + --audio_path poem.wav \ + --processor_path voxtral_preprocessor.pte \ + --temperature 0 2>&1) + EXIT_CODE=$? + set -e + + echo "$OUTPUT" + + if ! 
echo "$OUTPUT" | grep -iq "poem"; then + echo "Expected output 'poem' not found in output" + exit 1 + fi + + if [ $EXIT_CODE -ne 0 ]; then + echo "Unexpected exit code: $EXIT_CODE" + exit $EXIT_CODE + fi + echo "::endgroup::" diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 585522a8d01..540c6cc05f6 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -31,7 +31,7 @@ jobs: strategy: fail-fast: false matrix: - runner: [linux.2xlarge] + runner: [linux.4xlarge] docker-image-name: [ executorch-ubuntu-22.04-gcc9, executorch-ubuntu-22.04-clang12, diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index ac9d1c7e6a0..a9d0f466e55 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -148,8 +148,6 @@ jobs: extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/*.java \ extension/android/executorch_android/src/main/java/org/pytorch/executorch/annotations/*.java \ extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/*.java \ - examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/*.java \ - examples/demo-apps/android/LlamaDemo/app/src/androidTest/java/com/example/executorchllamademo/*.java \ extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/*.java \ extension/benchmark/android/benchmark/app/src/androidTest/java/org/pytorch/minibench/*.java) if [ -n "$FILES_NEEDS_FORMAT" ]; then diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index d8c551e8982..5b646cba9d1 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -286,15 +286,20 @@ jobs: # Test selective build PYTHON_EXECUTABLE=python bash examples/selective_build/test_selective_build.sh "${BUILD_TOOL}" - test-llava-runner-linux: - name: test-llava-runner-linux + test-multimodal-linux: + if: ${{ !github.event.pull_request.head.repo.fork }} + name: 
test-multimodal-linux uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main permissions: id-token: write contents: read + secrets: inherit strategy: fail-fast: false + matrix: + model: ["gemma3-4b"] # llava gives segfault so not covering. with: + secrets-env: EXECUTORCH_HF_TOKEN runner: linux.24xlarge docker-image: ci-image:executorch-ubuntu-22.04-clang12 submodules: 'recursive' @@ -305,17 +310,20 @@ jobs: CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" + echo "::group::Setup ExecuTorch" PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake" + echo "::endgroup::" - # install Llava requirements - bash examples/models/llama/install_requirements.sh - bash examples/models/llava/install_requirements.sh + echo "::group::Setup Huggingface" + pip install -U "huggingface_hub[cli]" accelerate + huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) + pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} + echo "::endgroup::" - # run python unittest - python -m unittest examples.models.llava.test.test_llava - - # run e2e (export, tokenizer and runner) - PYTHON_EXECUTABLE=python bash .ci/scripts/test_llava.sh + echo "::group::Test ${{ matrix.model }}" + python .ci/scripts/test_huggingface_optimum_model.py --model ${{ matrix.model }} --quantize --recipe xnnpack + echo "::endgroup::" test-moshi-linux: name: test-moshi-linux @@ -738,8 +746,8 @@ jobs: # Install llama requirements bash examples/models/llama/install_requirements.sh - # install a recent version of torchtune. 
- PYTHON_EXECUTABLE=python python -m pip install torchtune==0.7.0.dev20250730 --extra-index-url https://download.pytorch.org/whl/nightly/cpu + # install a recent version of torchtune (>= 20250730) + PYTHON_EXECUTABLE=python python -m pip install torchtune==0.7.0.dev20250929 --extra-index-url https://download.pytorch.org/whl/nightly/cpu # run llama runner in eager mode PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama_lora.sh @@ -779,7 +787,6 @@ jobs: contents: read strategy: fail-fast: false - if: false # TODO Re-enable after fixing timeouts (#14314) with: runner: linux.2xlarge docker-image: ci-image:executorch-ubuntu-22.04-gcc9 @@ -900,7 +907,9 @@ jobs: permissions: id-token: write contents: read + secrets: inherit with: + secrets-env: SAMSUNG_AI_LITECORE_KEY runner: linux.2xlarge docker-image: ci-image:executorch-ubuntu-22.04-clang12-android submodules: 'recursive' @@ -917,6 +926,7 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake" # Setup Samsung SDK (AI Lite Core) and install enn backend + export SAMSUNG_AI_LITECORE_KEY=$SECRET_SAMSUNG_AI_LITECORE_KEY source .ci/scripts/setup-samsung-linux-deps.sh # Test models serially @@ -925,6 +935,12 @@ jobs: python -m executorch.examples.samsung.aot_compiler --model_name=$model -c E9955 done + # Test quant models + model_scripts="deeplab_v3 edsr inception_v3 inception_v4 mobilenet_v2 mobilenet_v3 resnet18 resnet50 vit wav2letter" + for m_script in $model_scripts; do + python -m executorch.examples.samsung.scripts.${m_script} -c e9955 -p A8W8 + done + # Test ops python -m unittest discover -s backends/samsung/test/ops -p "test_*.py" @@ -959,11 +975,16 @@ jobs: PYTHON_EXECUTABLE=python bash backends/vulkan/test/scripts/test_model.sh --build # Test models serially - models="mv2 mv3 edsr resnet18 resnet50 dl3" + models="mv2 mv3 edsr resnet18 resnet50 dl3 w2l ic3 ic4" for model in $models; do python -m examples.vulkan.export --model_name=$model --test done + # For selected vision models, 
test with dynamic shapes + models="mv2 resnet18 resnet50 ic3 densenet161" + for model in $models; do + python -m examples.vulkan.export --model_name=$model --test -d + done test-vulkan-operators-linux: name: test-vulkan-operators-linux @@ -998,6 +1019,8 @@ jobs: ./cmake-out/backends/vulkan/test/custom_ops/q8csw_conv2d ./cmake-out/backends/vulkan/test/custom_ops/q4gsw_linear ./cmake-out/backends/vulkan/test/custom_ops/choose_qparams_per_row + ./cmake-out/backends/vulkan/test/custom_ops/qdq8ta_conv2d_activations + ./cmake-out/backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add # "Classic" Operator tests PYTHON_EXECUTABLE=python bash backends/vulkan/test/scripts/test_op.sh --build diff --git a/.github/workflows/test-backend-arm.yml b/.github/workflows/test-backend-arm.yml index e57be2704a2..22e3d524f6b 100644 --- a/.github/workflows/test-backend-arm.yml +++ b/.github/workflows/test-backend-arm.yml @@ -4,12 +4,17 @@ on: schedule: - cron: 0 2 * * * push: + branches: + - release/* tags: - ciflow/nightly/* pull_request: paths: - .github/workflows/test-backend-arm.yml - .github/workflows/_test_backend.yml + - .ci/scripts/test_backend.sh + - backends/test/suite/flow.py + - backends/test/suite/flows/arm.py workflow_dispatch: concurrency: @@ -21,7 +26,7 @@ jobs: uses: ./.github/workflows/_test_backend.yml with: backend: arm - flows: '["arm_tosa"]' + flows: '["arm_tosa_fp", "arm_tosa_int", "arm_ethos_u55", "arm_ethos_u85"]' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 120 run-linux: true diff --git a/.github/workflows/test-backend-coreml.yml b/.github/workflows/test-backend-coreml.yml index c6970ddff61..247f9576595 100644 --- a/.github/workflows/test-backend-coreml.yml +++ b/.github/workflows/test-backend-coreml.yml @@ -4,6 +4,8 @@ on: schedule: - cron: 0 2 * * * push: + branches: + - release/* tags: - ciflow/nightly/* pull_request: diff --git a/.github/workflows/test-backend-qnn.yml 
b/.github/workflows/test-backend-qnn.yml index 00933d6c74e..907c4d2dac0 100644 --- a/.github/workflows/test-backend-qnn.yml +++ b/.github/workflows/test-backend-qnn.yml @@ -4,6 +4,8 @@ on: schedule: - cron: 0 2 * * * push: + branches: + - release/* tags: - ciflow/nightly/* pull_request: diff --git a/.github/workflows/test-backend-vulkan.yml b/.github/workflows/test-backend-vulkan.yml index f04fdcdd1f1..cb2478fc825 100644 --- a/.github/workflows/test-backend-vulkan.yml +++ b/.github/workflows/test-backend-vulkan.yml @@ -4,6 +4,8 @@ on: schedule: - cron: 0 2 * * * push: + branches: + - release/* tags: - ciflow/nightly/* pull_request: diff --git a/.github/workflows/test-backend-xnnpack.yml b/.github/workflows/test-backend-xnnpack.yml index 2ae423dd99b..086c9625a38 100644 --- a/.github/workflows/test-backend-xnnpack.yml +++ b/.github/workflows/test-backend-xnnpack.yml @@ -4,6 +4,8 @@ on: schedule: - cron: 0 2 * * * push: + branches: + - release/* tags: - ciflow/nightly/* pull_request: diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 975a8ebbb30..8add54af49c 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -289,6 +289,7 @@ jobs: - test_arm_baremetal: test_models_ethos-u55 - test_arm_baremetal: test_models_ethos-u85 - test_arm_baremetal: test_smaller_stories_llama + - test_arm_baremetal: test_memory_allocation fail-fast: false with: runner: linux.2xlarge.memory @@ -345,7 +346,7 @@ jobs: elif [[ ${{ matrix.os}} == "zephyr-preset" ]]; then setup_script_args="--target-toolchain zephyr" toolchain_prefix=arm-zephyr-eabi- - threshold="135168" # 132 KiB + threshold="135240" # 132 KiB toolchain_cmake=examples/zephyr/x86_64-linux-arm-zephyr-eabi-gcc.cmake else echo "Fail unsupport OS selection ${{ matrix.os }}" @@ -594,15 +595,22 @@ jobs: strategy: matrix: model: [qwen3_4b, phi_4_mini] + runner: [linux.2xlarge] + docker-image: [executorch-ubuntu-22.04-clang12] + backend: [xnnpack] include: - model: qwen3_4b - 
test_with_runner: true + runner: linux.arm64.2xlarge + docker-image: executorch-ubuntu-22.04-gcc11-aarch64 + backend: torchao - model: phi_4_mini - test_with_runner: false + runner: linux.arm64.2xlarge + docker-image: executorch-ubuntu-22.04-gcc11-aarch64 + backend: torchao fail-fast: false with: - runner: linux.2xlarge - docker-image: ci-image:executorch-ubuntu-22.04-clang12 + runner: ${{ matrix.runner }} + docker-image: ci-image:${{ matrix.docker-image }} submodules: 'recursive' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 900 @@ -612,38 +620,54 @@ jobs: conda activate "${CONDA_ENV}" PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake + + if [[ "${{ matrix.backend }}" == "torchao" ]]; then + BUILD_TORCHAO_EXPERIMENTAL=1 TORCHAO_BUILD_CPU_AARCH64=1 TORCHAO_BUILD_KLEIDIAI=1 TORCHAO_ENABLE_ARM_NEON_DOT=1 TORCHAO_PARALLEL_BACKEND=OPENMP pip install third-party/ao + fi + pip install -U "huggingface_hub[cli]" - bash .ci/scripts/test_torchao_huggingface_checkpoints.sh ${{ matrix.model }} ${{ matrix.test_with_runner && '--test_with_runner' || '' }} - - # # TODO(jackzhxng): Runner consistently runs out of memory before test finishes. Try to find a more powerful runner. 
- # test-llava-runner-macos: - # name: test-llava-runner-macos - # uses: pytorch/test-infra/.github/workflows/macos_job.yml@main - # strategy: - # fail-fast: false - # with: - # runner: macos-14-xlarge - # python-version: '3.11' - # submodules: 'recursive' - # ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # timeout: 900 - # script: | - # BUILD_TOOL=cmake - - # bash .ci/scripts/setup-conda.sh - # # Setup MacOS dependencies as there is no Docker support on MacOS atm - # GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh --build-tool "${BUILD_TOOL}" - - # # install Llava requirements - # ${CONDA_RUN} bash examples/models/llama/install_requirements.sh - # ${CONDA_RUN} bash examples/models/llava/install_requirements.sh - - # # run python unittest - # ${CONDA_RUN} python -m unittest examples.models.llava.test.test_llava - - # # run e2e (export, tokenizer and runner) - # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llava.sh + bash .ci/scripts/test_torchao_huggingface_checkpoints.sh ${{ matrix.model }} ${{ matrix.model != 'phi_4_mini' && '--test_with_runner' || '' }} ${{ matrix.backend == 'torchao' && '--use_torchao_kernels' || '' }} + + test-multimodal-macos: + if: ${{ !github.event.pull_request.head.repo.fork }} + name: test-multimodal-macos + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + permissions: + id-token: write + contents: read + secrets: inherit + strategy: + fail-fast: false + matrix: + model: ["gemma3-4b"] # llava gives segfault so not covering. 
+ with: + secrets-env: EXECUTORCH_HF_TOKEN + runner: macos-15-xlarge + python-version: '3.11' + submodules: 'recursive' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 90 + script: | + echo "::group::Set up ExecuTorch" + bash .ci/scripts/setup-conda.sh + eval "$(conda shell.bash hook)" + + # Install requirements + ${CONDA_RUN} python install_executorch.py + echo "::endgroup::" + + echo "::group::Set up Huggingface" + ${CONDA_RUN} pip install -U "huggingface_hub[cli]" accelerate + ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) + ${CONDA_RUN} pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} + ${CONDA_RUN} pip list + echo "::endgroup::" + + echo "::group::Test ${{ matrix.model }}" + ${CONDA_RUN} python .ci/scripts/test_huggingface_optimum_model.py --model ${{ matrix.model }} --quantize --recipe xnnpack + echo "::endgroup::" test-qnn-model: name: test-qnn-model @@ -800,11 +824,26 @@ jobs: echo "Recipe: $RECIPE" echo "Quantize: $QUANTIZE" - echo "::group::Set up ExecuTorch" # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") conda activate "${CONDA_ENV}" - PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake + + echo "::group::Setup ExecuTorch" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake" + echo "::endgroup::" + + echo "::group::Setup Huggingface" + pip install -U "huggingface_hub[cli]" accelerate + huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) + pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} + echo "::endgroup::" + + echo "::group::Test MODEL: $MODEL RECIPE: $RECIPE QUANTIZE: $QUANTIZE" + 
export OUTPUT_DIR="$(pwd)/${MODEL}_${RECIPE}_${QUANTIZE}" + python .ci/scripts/test_huggingface_optimum_model.py --model "$MODEL" --recipe "$RECIPE" $QUANTIZE --model_dir "$OUTPUT_DIR" + echo "::endgroup::" + # Build executor_runner with ETdump enabled PYTHON_EXECUTABLE=python cmake -DPYTHON_EXECUTABLE=python \ -DCMAKE_INSTALL_PREFIX=cmake-out \ @@ -813,6 +852,7 @@ jobs: -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ @@ -822,25 +862,6 @@ jobs: -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -Bcmake-out . cmake --build cmake-out -j16 --target install --config Release - echo "::endgroup::" - - echo "::group::Set up Hugging Face" - pip install -U "huggingface_hub[cli]" - huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN - OPTIMUM_ET_COMMIT=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) - git clone https://github.com/huggingface/optimum-executorch - pushd optimum-executorch - # There is no release yet, for CI stability, always test from the same commit on main - git checkout $OPTIMUM_ET_COMMIT - python install_dev.py --skip_override_torch - popd - pip list - echo "::endgroup::" - - echo "::group::Run tests" - export OUTPUT_DIR="$(pwd)/${MODEL}_${RECIPE}_${QUANTIZE}" - python .ci/scripts/test_huggingface_optimum_model.py --model ${MODEL} --recipe ${RECIPE} ${QUANTIZE} --model_dir ${OUTPUT_DIR} - echo "::endgroup::" echo "::group::Generate artifacts for performance profiling" ./cmake-out/executor_runner \ @@ -907,16 +928,11 @@ jobs: ${CONDA_RUN} python install_executorch.py echo "::endgroup::" - echo "::group::Set up Hugging Face" - pip install -U "huggingface_hub[cli]" - huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN - OPTIMUM_ET_COMMIT=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) - git clone 
https://github.com/huggingface/optimum-executorch - pushd optimum-executorch - # There is no release yet, for CI stability, always test from the same commit on main - git checkout $OPTIMUM_ET_COMMIT - ${CONDA_RUN} python install_dev.py --skip_override_torch - popd + echo "::group::Set up Huggingface" + ${CONDA_RUN} pip install -U "huggingface_hub[cli]" accelerate + ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) + ${CONDA_RUN} pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} ${CONDA_RUN} pip list echo "::endgroup::" @@ -962,6 +978,60 @@ jobs: # Test llama2 PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}" + # this is for filtering out the qnn changes such that qnn jobs only triggered when the specific files are changed + changes: + runs-on: ubuntu-latest + outputs: + qnn: ${{ steps.filter.outputs.qnn }} + steps: + - uses: actions/checkout@v4 + - uses: dorny/paths-filter@v3 + id: filter + with: + filters: | + qnn: + - 'backends/qualcomm/**' + - 'examples/qualcomm/**' + - 'examples/models/llama/**' + + test-static-llama-qnn-eval-linux: + needs: changes # has dependency on changes jobs defined above + if: needs.changes.outputs.qnn == 'true' + name: test-static-llama-qnn-eval-linux + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + strategy: + fail-fast: false + matrix: + config: + - name: "baseline" + flags: "" + threshold: 62.0 + with: + runner: linux.2xlarge + docker-image: ci-image:executorch-ubuntu-22.04-qnn-sdk + submodules: 'recursive' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 180 + script: | + # The generic Linux job chooses to use base env, not the one setup by the 
image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + BUILD_TOOL="cmake" + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh + PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh + # Setup executorch + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "${BUILD_TOOL}" + # Setup install_requirements for llama + PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh + + echo ">>> Running config: ${{ matrix.config.name }}" + PYTHON_EXECUTABLE=python bash .ci/scripts/test_qnn_static_llama_eval.sh \ + --flags "${{ matrix.config.flags }}" \ + --threshold "${{ matrix.config.threshold }}" + unittest-release: uses: ./.github/workflows/_unittest.yml permissions: @@ -1016,8 +1086,8 @@ jobs: strategy: fail-fast: false matrix: - model: [linear, add, add_mul, ic3, ic4, mv2, mv3, resnet18, resnet50, vit, w2l, mobilebert, emformer_join, emformer_transcribe] - backend: [portable, xnnpack-f32, xnnpack-q8] + model: [mv3, resnet50, vit, mobilebert, emformer_transcribe] + backend: [portable, xnnpack-q8] with: submodules: 'recursive' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} diff --git a/.lintrunner.toml b/.lintrunner.toml index 0b6a6eb8908..b366c141799 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -206,6 +206,7 @@ exclude_patterns = [ '**/*.png', '**/*.webp', '**/*.jpeg', + '**/*.mp3', '**/*.mp4', '**/*.pte', '**/*.pth', @@ -216,6 +217,9 @@ exclude_patterns = [ '**/*.jpg', '**/*.jar', '**/*.gif', + 'extension/llm/tokenizers', + 'extension/llm/tokenizers/**', + 'examples/cuda', # File contains @generated 'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h', 'extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_special_unstrided_cpu.h', diff --git a/CMakeLists.txt b/CMakeLists.txt index fc427d517a9..10e2eb437e3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -226,7 +226,7 @@ 
if(EXECUTORCH_BUILD_CPUINFO) install( TARGETS cpuinfo EXPORT ExecuTorchTargets - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} INCLUDES DESTINATION ${_common_include_directories} ) @@ -266,10 +266,22 @@ if(EXECUTORCH_BUILD_PTHREADPOOL) executorch_move_interface_include_directories_to_build_time_only( pthreadpool_interface ) + + if(APPLE) + # Use hidden visibility for pthreadpool on Apple platforms to avoid issues + # with pthreadpool symbols from libtorch_cpu taking precedence over the ones + # from the pthreadpool library statically linked in _portable_lib. The + # pthreadpool public APIs are marked as weak by default on some Apple + # platforms, so setting to hidden visibility works around this by not + # putting the symbol in the indirection table. See + # https://github.com/pytorch/executorch/issues/14321 for more details. + target_compile_options(pthreadpool PRIVATE -fvisibility=hidden) + endif() + install( TARGETS pthreadpool pthreadpool_interface fxdiv EXPORT ExecuTorchTargets - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} INCLUDES DESTINATION ${_common_include_directories} ) @@ -284,7 +296,10 @@ if(EXECUTORCH_BUILD_TESTS) endif() # TODO(dbort): Fix these warnings and remove this flag. -set(_common_compile_options -Wno-deprecated-declarations -fPIC) +set(_common_compile_options + $<$:/wd4996> + $<$>:-Wno-deprecated-declarations -fPIC> +) # Let files say "include ". 
# TODO(#6475): This requires/assumes that the repo lives in a directory named @@ -587,6 +602,16 @@ endif() if(EXECUTORCH_BUILD_CORTEX_M) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cortex_m) + list(APPEND _executorch_backends coretex_m_backend) +endif() + +if(EXECUTORCH_BUILD_CUDA) + # Build common AOTI functionality (required for CUDA) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti) + # Build CUDA-specific AOTI functionality + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cuda) + # Add aoti_cuda to backends - it already depends on aoti_common + list(APPEND _executorch_backends aoti_cuda) endif() if(EXECUTORCH_BUILD_EXTENSION_APPLE) @@ -630,6 +655,11 @@ if(EXECUTORCH_BUILD_EXTENSION_MODULE) list(APPEND _executorch_extensions extension_module_static) endif() +if(EXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/named_data_map) + list(APPEND _executorch_extensions extension_named_data_map) +endif() + if(EXECUTORCH_BUILD_EXTENSION_LLM) if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER) set(SUPPORT_REGEX_LOOKAHEAD ON) @@ -650,15 +680,6 @@ if(EXECUTORCH_BUILD_EXTENSION_LLM) list(APPEND _executorch_extensions tokenizers) endif() -if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/runner) - list(APPEND _executorch_extensions extension_llm_runner) -endif() - -if(EXECUTORCH_BUILD_EXTENSION_LLM_APPLE) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/apple) -endif() - if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/runner_util) install( @@ -717,7 +738,7 @@ if(EXECUTORCH_BUILD_KERNELS_TORCHAO) install( TARGETS torchao_ops_executorch torchao_kernels_aarch64 EXPORT ExecuTorchTargets - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} INCLUDES DESTINATION ${_common_include_directories} ) @@ -728,7 +749,7 @@ if(EXECUTORCH_BUILD_KERNELS_TORCHAO) install( TARGETS kleidiai EXPORT 
ExecuTorchTargets - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} INCLUDES DESTINATION ${_common_include_directories} ) @@ -738,9 +759,6 @@ endif() if(EXECUTORCH_BUILD_PYBIND) - # Add codegen tools subdirectory for selective_build pybind module - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/codegen/tools) - if(NOT EXECUTORCH_BUILD_EXTENSION_DATA_LOADER) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/data_loader) endif() @@ -749,6 +767,9 @@ if(EXECUTORCH_BUILD_PYBIND) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools) endif() + # Add codegen tools subdirectory for selective_build pybind module + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/codegen/tools) + # Create bundled_module target only for pybindings when bundled_program exists # This target has hard dependencies on devtools generated headers if(TARGET bundled_program) @@ -769,7 +790,10 @@ if(EXECUTORCH_BUILD_PYBIND) bundled_module PUBLIC ${_common_include_directories} ) target_compile_options( - bundled_module PUBLIC -Wno-deprecated-declarations -fPIC + bundled_module + PUBLIC $<$:/wd4996> + $<$>:-Wno-deprecated-declarations + -fPIC> ) endif() @@ -841,8 +865,14 @@ if(EXECUTORCH_BUILD_PYBIND) endif() # compile options for pybind - set(_pybind_compile_options -Wno-deprecated-declarations -fPIC -frtti - -fexceptions + set(_pybind_compile_options + $<$:/EHsc + /GR + /wd4996> + $<$>:-Wno-deprecated-declarations + -fPIC + -frtti + -fexceptions> ) # util lib @@ -869,6 +899,21 @@ if(EXECUTORCH_BUILD_PYBIND) target_compile_options(portable_lib PUBLIC ${_pybind_compile_options}) target_link_libraries(portable_lib PRIVATE ${_dep_libs}) + # Set RPATH to find PyTorch libraries relative to the installation location + # This goes from executorch/extension/pybindings up to site-packages, then to + # torch/lib + if(APPLE) + set_target_properties( + portable_lib PROPERTIES BUILD_RPATH "@loader_path/../../../torch/lib" + INSTALL_RPATH "@loader_path/../../../torch/lib" + ) + else() + set_target_properties( + 
portable_lib PROPERTIES BUILD_RPATH "$ORIGIN/../../../torch/lib" + INSTALL_RPATH "$ORIGIN/../../../torch/lib" + ) + endif() + install( TARGETS portable_lib EXPORT ExecuTorchTargets @@ -889,6 +934,15 @@ if(EXECUTORCH_BUILD_EXTENSION_TRAINING) list(APPEND _executorch_extensions extension_training) endif() +if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/runner) + list(APPEND _executorch_extensions extension_llm_runner) +endif() + +if(EXECUTORCH_BUILD_EXTENSION_LLM_APPLE) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/apple) +endif() + if(EXECUTORCH_BUILD_KERNELS_LLM) # TODO: move all custom kernels to ${CMAKE_CURRENT_SOURCE_DIR}/kernels/custom add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/custom_ops) @@ -984,7 +1038,7 @@ if(NOT EXECUTORCH_SELECT_OPS_YAML STREQUAL "" install( TARGETS executorch_selected_kernels EXPORT ExecuTorchTargets - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} ) else() # No selective build - link the full library. 
@@ -1006,6 +1060,10 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER) extension_runner_util gflags executorch_backends ) + if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR) + list(APPEND _executor_runner_libs extension_flat_tensor) + endif() + if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) list(APPEND _executor_runner_libs optimized_native_cpu_ops_lib) elseif(EXECUTORCH_BUILD_CADENCE) diff --git a/CMakePresets.json b/CMakePresets.json index bcf3bbc8d83..379f4f418ed 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -63,7 +63,8 @@ "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/third-party/ios-cmake/ios.toolchain.cmake", "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/ios.cmake", "PLATFORM": "OS64", - "DEPLOYMENT_TARGET": "17.0" + "DEPLOYMENT_TARGET": "17.0", + "CMAKE_OSX_DEPLOYMENT_TARGET": "12.0" }, "condition": { "lhs": "${hostSystemName}", @@ -80,7 +81,8 @@ "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/third-party/ios-cmake/ios.toolchain.cmake", "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/ios.cmake", "PLATFORM": "SIMULATORARM64", - "DEPLOYMENT_TARGET": "17.0" + "DEPLOYMENT_TARGET": "17.0", + "CMAKE_OSX_DEPLOYMENT_TARGET": "12.0" }, "condition": { "lhs": "${hostSystemName}", diff --git a/CODEOWNERS b/CODEOWNERS index 10baed9ede4..11f3ca07615 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -14,6 +14,7 @@ /backends/transforms @kimishpatel /backends/vulkan @SS-JIA /backends/xnnpack @digantdesai @mcr229 +/backends/nxp @robert-kalmar /devtools @Gasoonjia @@ -33,6 +34,7 @@ /examples/qualcomm @cccclai /examples/selective_build @lucylq @larryliu0820 @JacobSzwejbka /examples/xnnpack @digantdesai @mcr229 +/examples/nxp @robert-kalmar /exir/backend @cccclai @kimishpatel @JacobSzwejbka /exir @JacobSzwejbka @larryliu0820 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2f4de863dad..45e03bd36e1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -199,8 +199,7 @@ We use [`lintrunner`](https://pypi.org/project/lintrunner/) to help make sure th code follows our standards. 
Set it up with: ``` -pip install lintrunner==0.12.7 -pip install lintrunner-adapters==0.12.4 +./install_requirements.sh # (automatically run by install_executorch.sh) lintrunner init ``` diff --git a/README-wheel.md b/README-wheel.md index a59af8ea05f..7ae9b0aa2e0 100644 --- a/README-wheel.md +++ b/README-wheel.md @@ -25,6 +25,6 @@ tutorials and documentation. Here are some starting points: * [Exporting to ExecuTorch](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial) * Learn the fundamentals of exporting a PyTorch `nn.Module` to ExecuTorch, and optimizing its performance using quantization and hardware delegation. -* Running etLLM on [iOS](https://github.com/meta-pytorch/executorch-examples/tree/main/llm/apple) and [Android](docs/source/llm/llama-demo-android.md) devices. +* Running etLLM on [iOS](https://github.com/meta-pytorch/executorch-examples/tree/main/llm/apple) and [Android](https://github.com/meta-pytorch/executorch-examples/tree/main/llm/android) devices. * Build and run LLaMA in a demo mobile app, and learn how to integrate models with your own apps. diff --git a/README.md b/README.md index 17327990a1d..531fcc3b4ef 100644 --- a/README.md +++ b/README.md @@ -1,72 +1,250 @@
- Logo -

ExecuTorch: A powerful on-device AI Framework

+ ExecuTorch logo mark +

ExecuTorch

+

On-device AI inference powered by PyTorch

-
- Contributors - Stargazers - Join our Discord community - Check out the documentation -
+ PyPI - Version + GitHub - Contributors + GitHub - Stars + Discord - Chat with Us + Documentation
-**ExecuTorch** is an end-to-end solution for on-device inference and training. It powers much of Meta's on-device AI experiences across Facebook, Instagram, Meta Quest, Ray-Ban Meta Smart Glasses, WhatsApp, and more. +**ExecuTorch** is PyTorch's unified solution for deploying AI models on-device—from smartphones to microcontrollers—built for privacy, performance, and portability. It powers Meta's on-device AI across **Instagram, WhatsApp, Quest 3, Ray-Ban Meta Smart Glasses**, and [more](https://docs.pytorch.org/executorch/main/success-stories.html). + +Deploy **LLMs, vision, speech, and multimodal models** with the same PyTorch APIs you already know—accelerating research to production with seamless model export, optimization, and deployment. No manual C++ rewrites. No format conversions. No vendor lock-in. + +
+ 📘 Table of Contents + +- [Why ExecuTorch?](#why-executorch) +- [How It Works](#how-it-works) +- [Quick Start](#quick-start) + - [Installation](#installation) + - [Export and Deploy in 3 Steps](#export-and-deploy-in-3-steps) + - [Run on Device](#run-on-device) + - [LLM Example: Llama](#llm-example-llama) +- [Platform & Hardware Support](#platform--hardware-support) +- [Production Deployments](#production-deployments) +- [Examples & Models](#examples--models) +- [Key Features](#key-features) +- [Documentation](#documentation) +- [Community & Contributing](#community--contributing) +- [License](#license) + +
+ +## Why ExecuTorch? + +- **🔒 Native PyTorch Export** — Direct export from PyTorch. No .onnx, .tflite, or intermediate format conversions. Preserve model semantics. +- **⚡ Production-Proven** — Powers billions of users at [Meta with real-time on-device inference](https://engineering.fb.com/2025/07/28/android/executorch-on-device-ml-meta-family-of-apps/). +- **💾 Tiny Runtime** — 50KB base footprint. Runs on microcontrollers to high-end smartphones. +- **🚀 [12+ Hardware Backends](https://docs.pytorch.org/executorch/main/backends-overview.html)** — Open-source acceleration for Apple, Qualcomm, ARM, MediaTek, Vulkan, and more. +- **🎯 One Export, Multiple Backends** — Switch hardware targets with a single line change. Deploy the same model everywhere. + +## How It Works + +ExecuTorch uses **ahead-of-time (AOT) compilation** to prepare PyTorch models for edge deployment: + +1. **🧩 Export** — Capture your PyTorch model graph with `torch.export()` +2. **⚙️ Compile** — Quantize, optimize, and partition to hardware backends → `.pte` +3. **🚀 Execute** — Load `.pte` on-device via lightweight C++ runtime + +Models use a standardized [Core ATen operator set](https://docs.pytorch.org/executorch/main/compiler-ir-advanced.html#intermediate-representation). [Partitioners](https://docs.pytorch.org/executorch/main/compiler-delegate-and-partitioner.html) delegate subgraphs to specialized hardware (NPU/GPU) with CPU fallback. + +Learn more: [How ExecuTorch Works](https://docs.pytorch.org/executorch/main/intro-how-it-works.html) • [Architecture Guide](https://docs.pytorch.org/executorch/main/getting-started-architecture.html) + +## Quick Start + +### Installation + +```bash +pip install executorch +``` + +For platform-specific setup (Android, iOS, embedded systems), see the [Quick Start](https://docs.pytorch.org/executorch/main/quick-start-section.html) documentation for additional info. 
+ +### Export and Deploy in 3 Steps + +```python +import torch +from executorch.exir import to_edge_transform_and_lower +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner + +# 1. Export your PyTorch model +model = MyModel().eval() +example_inputs = (torch.randn(1, 3, 224, 224),) +exported_program = torch.export.export(model, example_inputs) + +# 2. Optimize for target hardware (switch backends with one line) +program = to_edge_transform_and_lower( + exported_program, + partitioner=[XnnpackPartitioner()] # CPU | CoreMLPartitioner() for iOS | QnnPartitioner() for Qualcomm +).to_executorch() + +# 3. Save for deployment +with open("model.pte", "wb") as f: + f.write(program.buffer) + +# Test locally via ExecuTorch runtime's pybind API (optional) +from executorch.runtime import Runtime +runtime = Runtime.get() +method = runtime.load_program("model.pte").load_method("forward") +outputs = method.execute([torch.randn(1, 3, 224, 224)]) +``` + +### Run on Device + +**[C++](https://docs.pytorch.org/executorch/main/using-executorch-cpp.html)** +```cpp +#include +#include + +Module module("model.pte"); +auto tensor = make_tensor_ptr({2, 2}, {1.0f, 2.0f, 3.0f, 4.0f}); +auto outputs = module.forward({tensor}); +``` + +**[Swift (iOS)](https://docs.pytorch.org/executorch/main/ios-section.html)** +```swift +let module = Module(filePath: "model.pte") +let input = Tensor([1.0, 2.0, 3.0, 4.0]) +let outputs: [Value] = try module.forward([input]) +``` + +**[Kotlin (Android)](https://docs.pytorch.org/executorch/main/android-section.html)** +```kotlin +val module = Module.load("model.pte") +val inputTensor = Tensor.fromBlob(floatArrayOf(1.0f, 2.0f, 3.0f, 4.0f), longArrayOf(2, 2)) +val outputs = module.forward(EValue.from(inputTensor)) +``` + +### LLM Example: Llama + +Export Llama models using the [`export_llm`](https://docs.pytorch.org/executorch/main/llm/export-llm.html) script or 
[Optimum-ExecuTorch](https://github.com/huggingface/optimum-executorch): + +```bash +# Using export_llm +python -m executorch.extension.llm.export.export_llm --model llama3_2 --output llama.pte + +# Using Optimum-ExecuTorch +optimum-cli export executorch \ + --model meta-llama/Llama-3.2-1B \ + --task text-generation \ + --recipe xnnpack \ + --output_dir llama_model +``` -It supports a wide range of models including LLMs (Large Language Models), CV (Computer Vision), ASR (Automatic Speech Recognition), and TTS (Text to Speech). +Run on-device with the LLM runner API: -Platform Support: -- Operating Systems: - - iOS - - MacOS (ARM64) - - Android - - Linux - - Microcontrollers +**[C++](https://docs.pytorch.org/executorch/main/llm/run-with-c-plus-plus.html)** +```cpp +#include -- Hardware Acceleration: - - Apple - - Arm - - Cadence - - MediaTek - - NXP - - OpenVINO - - Qualcomm - - Vulkan - - XNNPACK +auto runner = create_llama_runner("llama.pte", "tiktoken.bin"); +executorch::extension::llm::GenerationConfig config{ + .seq_len = 128, .temperature = 0.8f}; +runner->generate("Hello, how are you?", config); +``` -Key value propositions of ExecuTorch are: +**[Swift (iOS)](https://docs.pytorch.org/executorch/main/llm/run-on-ios.html)** +```swift +let runner = TextRunner(modelPath: "llama.pte", tokenizerPath: "tiktoken.bin") +try runner.generate("Hello, how are you?", Config { + $0.sequenceLength = 128 +}) { token in + print(token, terminator: "") +} +``` -- **Portability:** Compatibility with a wide variety of computing platforms, - from high-end mobile phones to highly constrained embedded systems and - microcontrollers. -- **Productivity:** Enabling developers to use the same toolchains and Developer - Tools from PyTorch model authoring and conversion, to debugging and deployment - to a wide variety of platforms. 
-- **Performance:** Providing end users with a seamless and high-performance - experience due to a lightweight runtime and utilizing full hardware - capabilities such as CPUs, NPUs, and DSPs. +**Kotlin (Android)** — [API Docs](https://docs.pytorch.org/executorch/main/javadoc/org/pytorch/executorch/extension/llm/package-summary.html) • [Demo App](https://github.com/meta-pytorch/executorch-examples/tree/main/llm/android/LlamaDemo) +```kotlin +val llmModule = LlmModule("llama.pte", "tiktoken.bin", 0.8f) +llmModule.load() +llmModule.generate("Hello, how are you?", 128, object : LlmCallback { + override fun onResult(result: String) { print(result) } + override fun onStats(stats: String) { } +}) +``` -## Getting Started -To get started you can: +For multimodal models (vision, audio), use the [MultiModal runner API](extension/llm/runner) which extends the LLM runner to handle image and audio inputs alongside text. See [Llava](examples/models/llava/README.md) and [Voxtral](examples/models/voxtral/README.md) examples. -- Visit the [Step by Step Tutorial](https://pytorch.org/executorch/stable/getting-started.html) to get things running locally and deploy a model to a device -- Use this [Colab Notebook](https://colab.research.google.com/drive/1qpxrXC3YdJQzly3mRg-4ayYiOjC6rue3?usp=sharing) to start playing around right away -- Jump straight into LLM use cases by following specific instructions for popular open-source models such as [Llama](examples/models/llama/README.md), [Qwen 3](examples/models/qwen3/README.md), [Phi-4-mini](examples/models/phi_4_mini/README.md), [Llava](examples/models/llava/README.md), [Voxtral](examples/models/voxtral/README.md), and [LFM2](examples/models/lfm2/README.md). +See [examples/models/llama](examples/models/llama/README.md) for complete workflow including quantization, mobile deployment, and advanced options. 
-## Feedback and Engagement +**Next Steps:** +- 📖 [Step-by-step tutorial](https://docs.pytorch.org/executorch/main/getting-started.html) — Complete walkthrough for your first model +- ⚡ [Colab notebook](https://colab.research.google.com/drive/1qpxrXC3YdJQzly3mRg-4ayYiOjC6rue3?usp=sharing) — Try ExecuTorch instantly in your browser +- 🤖 [Deploy Llama models](examples/models/llama/README.md) — LLM workflow with quantization and mobile demos -We welcome any feedback, suggestions, and bug reports from the community to help -us improve our technology. Check out the [Discussion Board](https://github.com/pytorch/executorch/discussions) or chat real time with us on [Discord](https://discord.gg/Dh43CKSAdc) +## Platform & Hardware Support -## Contributing +| **Platform** | **Supported Backends** | +|------------------|----------------------------------------------------------| +| Android | XNNPACK, Vulkan, Qualcomm, MediaTek, Samsung Exynos | +| iOS | XNNPACK, MPS, CoreML (Neural Engine) | +| Linux / Windows | XNNPACK, OpenVINO, CUDA *(experimental)* | +| macOS | XNNPACK, MPS, Metal *(experimental)* | +| Embedded / MCU | XNNPACK, ARM Ethos-U, NXP, Cadence DSP | -We welcome contributions. To get started review the [guidelines](CONTRIBUTING.md) and chat with us on [Discord](https://discord.gg/Dh43CKSAdc) +See [Backend Documentation](https://docs.pytorch.org/executorch/main/backends-overview.html) for detailed hardware requirements and optimization guides. +## Production Deployments -## Directory Structure +ExecuTorch powers on-device AI at scale across Meta's family of apps, VR/AR devices, and partner deployments. [View success stories →](https://docs.pytorch.org/executorch/main/success-stories.html) -Please refer to the [Codebase structure](CONTRIBUTING.md#codebase-structure) section of the [Contributing Guidelines](CONTRIBUTING.md) for more details. 
+## Examples & Models + +**LLMs:** [Llama 3.2/3.1/3](examples/models/llama/README.md), [Qwen 3](examples/models/qwen3/README.md), [Phi-4-mini](examples/models/phi_4_mini/README.md), [LiquidAI LFM2](examples/models/lfm2/README.md) + +**Multimodal:** [Llava](examples/models/llava/README.md) (vision-language), [Voxtral](examples/models/voxtral/README.md) (audio-language) + +**Vision/Speech:** [MobileNetV2](https://github.com/meta-pytorch/executorch-examples/tree/main/mv2), [DeepLabV3](https://github.com/meta-pytorch/executorch-examples/tree/main/dl3) + +**Resources:** [`examples/`](examples/) directory • [executorch-examples](https://github.com/meta-pytorch/executorch-examples) mobile demos • [Optimum-ExecuTorch](https://github.com/huggingface/optimum-executorch) for HuggingFace models + +## Key Features + +ExecuTorch provides advanced capabilities for production deployment: + +- **Quantization** — Built-in support via [torchao](https://docs.pytorch.org/ao) for 8-bit, 4-bit, and dynamic quantization +- **Memory Planning** — Optimize memory usage with ahead-of-time allocation strategies +- **Developer Tools** — ETDump profiler, ETRecord inspector, and model debugger +- **Selective Build** — Strip unused operators to minimize binary size +- **Custom Operators** — Extend with domain-specific kernels +- **Dynamic Shapes** — Support variable input sizes with bounded ranges + +See [Advanced Topics](https://docs.pytorch.org/executorch/main/advanced-topics-section.html) for quantization techniques, custom backends, and compiler passes. 
+ +## Documentation + +- [**Documentation Home**](https://docs.pytorch.org/executorch/main/index.html) — Complete guides and tutorials +- [**API Reference**](https://docs.pytorch.org/executorch/main/api-section.html) — Python, C++, Java/Kotlin APIs +- [**Backend Integration**](https://docs.pytorch.org/executorch/main/backend-delegates-integration.html) — Build custom hardware backends +- [**Troubleshooting**](https://docs.pytorch.org/executorch/main/using-executorch-troubleshooting.html) — Common issues and solutions + +## Community & Contributing + +We welcome contributions from the community! + +- 💬 [**GitHub Discussions**](https://github.com/pytorch/executorch/discussions) — Ask questions and share ideas +- 🎮 [**Discord**](https://discord.gg/Dh43CKSAdc) — Chat with the team and community +- 🐛 [**Issues**](https://github.com/pytorch/executorch/issues) — Report bugs or request features +- 🤝 [**Contributing Guide**](CONTRIBUTING.md) — Guidelines and codebase structure ## License -ExecuTorch is BSD licensed, as found in the LICENSE file. + +ExecuTorch is BSD licensed, as found in the [LICENSE](LICENSE) file. + +

+ +--- + +
+

Part of the PyTorch ecosystem

+

+ GitHub • + Documentation +

+
diff --git a/backends/aoti/CMakeLists.txt b/backends/aoti/CMakeLists.txt new file mode 100644 index 00000000000..fcabb0a3f2b --- /dev/null +++ b/backends/aoti/CMakeLists.txt @@ -0,0 +1,57 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# +# Build AOTI backend for runtime. +# +# ### Editing this file ### +# +# This file should be formatted with +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ +# It should also be cmake-lint clean. +# +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# Source root directory for executorch. +if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) +endif() + +# Use ExecuTorch's standard way to find PyTorch libraries for AOTI +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) +find_package_torch() + +# Common AOTI functionality - combines all AOTI common components +set(_aoti_common_sources aoti_model_container.cpp common_shims.cpp) +add_library(aoti_common STATIC ${_aoti_common_sources}) +target_include_directories( + aoti_common + PUBLIC $ + $ + $ + # PyTorch AOTI headers from ExecuTorch's torch detection + ${TORCH_INCLUDE_DIRS} +) +target_compile_options( + aoti_common + PUBLIC $<$:/EHsc /GR> + $<$>:-fexceptions -frtti -fPIC> +) +# Ensure symbols are exported properly +target_link_options( + aoti_common PUBLIC $<$>:-Wl,--export-dynamic> +) + +# Link against ExecuTorch libraries and standard libraries +target_link_libraries(aoti_common PUBLIC extension_tensor ${CMAKE_DL_LIBS}) +executorch_target_link_options_shared_lib(aoti_common) + +install( + TARGETS aoti_common + EXPORT ExecuTorchTargets + DESTINATION ${CMAKE_INSTALL_LIBDIR} +) diff --git a/backends/aoti/README.md b/backends/aoti/README.md new file mode 100644 index 00000000000..74b45a35e5d --- /dev/null +++ b/backends/aoti/README.md @@ -0,0 +1,28 @@ +# AOTI Common Library + +This directory 
contains **common library components** for AOTI (Ahead-of-Time Inductor) driven backends in ExecuTorch, **not a standalone backend**. + +## Purpose + +The code in this directory provides shared functionality and utilities that are used by actual AOTI-driven backends such as: + +- **CUDA backend** - Uses AOTI for GPU acceleration +- Other AOTI-powered backends + +## Components + +- **`common_shims.cpp/h`** - Common shim functions that bridge ExecuTorch tensor operations with AOTI requirements +- **`aoti_model_container.cpp/h`** - Model container functionality for AOTI models +- **`utils.h`** - Utility functions and type definitions +- **`tests/`** - Unit tests for the common functionality + +## Usage + +This library is intended to be used as a dependency by actual AOTI backend implementations. It is not a backend that can be used directly for model execution. + +For example backend implementations that use this common library, see: +- `executorch/backends/cuda/` - CUDA AOTI backend + +## Building + +The common library components are built as part of the AOTI backend build process. See the `TARGETS` file for build configurations. diff --git a/backends/aoti/TARGETS b/backends/aoti/TARGETS new file mode 100644 index 00000000000..77871de4469 --- /dev/null +++ b/backends/aoti/TARGETS @@ -0,0 +1,3 @@ +load("targets.bzl", "define_common_targets") + +define_common_targets() diff --git a/backends/aoti/aoti_model_container.cpp b/backends/aoti/aoti_model_container.cpp new file mode 100644 index 00000000000..46a246faeb8 --- /dev/null +++ b/backends/aoti/aoti_model_container.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +namespace executorch { +namespace backends { +namespace aoti { + +extern "C" { + +// Global function pointers for AOT Inductor model container operations +// These will be loaded dynamically from the shared library +AOTInductorModelContainerCreateWithDeviceFunc + AOTInductorModelContainerCreateWithDevice = nullptr; +AOTInductorModelContainerDeleteFunc AOTInductorModelContainerDelete = nullptr; +AOTInductorModelContainerGetNumInputsFunc + AOTInductorModelContainerGetNumInputs = nullptr; +AOTInductorModelContainerGetNumOutputsFunc + AOTInductorModelContainerGetNumOutputs = nullptr; +AOTInductorModelContainerRunFunc AOTInductorModelContainerRun = nullptr; + +// Additional global function pointers for AOT Inductor model container +// operations needed by Metal backend +AOTInductorModelContainerGetInputNameFunc + AOTInductorModelContainerGetInputName = nullptr; +AOTInductorModelContainerGetNumConstantsFunc + AOTInductorModelContainerGetNumConstants = nullptr; + +} // extern "C" + +} // namespace aoti +} // namespace backends +} // namespace executorch diff --git a/backends/aoti/aoti_model_container.h b/backends/aoti/aoti_model_container.h new file mode 100644 index 00000000000..877f019c457 --- /dev/null +++ b/backends/aoti/aoti_model_container.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include + +namespace executorch { +namespace backends { +namespace aoti { + +using executorch::runtime::Error; +using executorch::runtime::etensor::Tensor; + +extern "C" { + +// Type definitions +using AOTITensorHandle = Tensor*; +using AOTIRuntimeError = Error; + +// Forward declarations for AOT Inductor model container +struct AOTInductorModelContainerOpaque; +using AOTInductorModelContainerHandle = AOTInductorModelContainerOpaque*; +using AOTInductorStreamHandle = void*; +using AOTIProxyExecutorHandle = void*; + +// Function pointer types for AOT Inductor model container operations +using AOTInductorModelContainerCreateWithDeviceFunc = AOTIRuntimeError (*)( + AOTInductorModelContainerHandle* container_handle, + size_t num_models, + const char* device_str, + const char* cubin_dir); + +using AOTInductorModelContainerDeleteFunc = + AOTIRuntimeError (*)(AOTInductorModelContainerHandle container_handle); + +using AOTInductorModelContainerGetNumInputsFunc = AOTIRuntimeError (*)( + AOTInductorModelContainerHandle container_handle, + size_t* num_inputs); + +using AOTInductorModelContainerGetNumOutputsFunc = AOTIRuntimeError (*)( + AOTInductorModelContainerHandle container_handle, + size_t* num_outputs); + +using AOTInductorModelContainerRunFunc = AOTIRuntimeError (*)( + AOTInductorModelContainerHandle container_handle, + Tensor** input_handles, // array of input Tensor*; handles + // are stolen; the array itself is borrowed + size_t num_inputs, + Tensor** output_handles, // array for writing output Tensor*; handles + // will be stolen by the caller; the array itself + // is borrowed + size_t n_outputs, + AOTInductorStreamHandle stream_handle, + AOTIProxyExecutorHandle proxy_executor_handle); + +// Global function pointers (will be loaded dynamically) +extern AOTInductorModelContainerCreateWithDeviceFunc + AOTInductorModelContainerCreateWithDevice; +extern AOTInductorModelContainerDeleteFunc AOTInductorModelContainerDelete; +extern 
AOTInductorModelContainerGetNumInputsFunc + AOTInductorModelContainerGetNumInputs; +extern AOTInductorModelContainerGetNumOutputsFunc + AOTInductorModelContainerGetNumOutputs; +extern AOTInductorModelContainerRunFunc AOTInductorModelContainerRun; + +// Retrieves the name of an input tensor by index from the AOTI model container. +// Needed by Metal backend +using AOTInductorModelContainerGetInputNameFunc = AOTIRuntimeError (*)( + AOTInductorModelContainerHandle container_handle, + size_t input_idx, + const char** input_name); + +// Retrieves the number of constants from the AOTI model container. +// Needed by Metal backend +using AOTInductorModelContainerGetNumConstantsFunc = AOTIRuntimeError (*)( + AOTInductorModelContainerHandle container_handle, + size_t* num_constants); + +// Global function pointers (will be loaded dynamically). +// Needed by Metal backend +extern AOTInductorModelContainerGetInputNameFunc + AOTInductorModelContainerGetInputName; +extern AOTInductorModelContainerGetNumConstantsFunc + AOTInductorModelContainerGetNumConstants; + +} // extern "C" + +// AOTI Delegate Handle structure +struct AOTIDelegateHandle { + void* so_handle; + std::string so_path; + AOTInductorModelContainerHandle container_handle; + void* cuda_stream; // cudaStream_t stored as void* to avoid CUDA header + // dependency +}; + +} // namespace aoti +} // namespace backends +} // namespace executorch diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp new file mode 100644 index 00000000000..1afd137aa26 --- /dev/null +++ b/backends/aoti/common_shims.cpp @@ -0,0 +1,195 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +namespace executorch { +namespace backends { +namespace aoti { + +namespace internal { +// Global storage for tensor metadata +std::unordered_map> tensor_to_sizes; +std::unordered_map> tensor_to_strides; +} // namespace internal + +extern "C" { + +// Autograd mode functions +int32_t aoti_torch_grad_mode_is_enabled() { + // No autograd ever + return false; +} + +void aoti_torch_grad_mode_set_enabled(bool enabled) { + if (enabled) { + throw std::runtime_error("Cannot enable autograd"); + } +} + +// Tensor attribute operations +AOTITorchError aoti_torch_get_data_ptr(Tensor* tensor, void** ret_data_ptr) { + *ret_data_ptr = tensor->mutable_data_ptr(); + return Error::Ok; +} + +AOTITorchError aoti_torch_get_storage_offset( + Tensor* tensor, + int64_t* ret_storage_offset) { + // Storage offset is always 0 in ET + *ret_storage_offset = 0; + + return Error::Ok; +} + +AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides) { + auto it = internal::tensor_to_strides.find(tensor); + bool needs_update = false; + + if (it == internal::tensor_to_strides.end()) { + needs_update = true; + } else { + // CRITICAL: Multimodal models reuse tensors with different shapes across + // executions (e.g., variable-length audio). We MUST validate cached + // metadata matches current tensor state, or CUDA kernels will receive + // incorrect shapes leading to memory corruption and segfaults. 
+ auto tensor_strides = tensor->strides(); + needs_update = !std::equal( + it->second.begin(), + it->second.end(), + tensor_strides.begin(), + tensor_strides.end()); + } + + if (needs_update) { + std::vector strides(tensor->dim()); + auto tensor_strides = tensor->strides(); + for (int i = 0; i < tensor->dim(); i++) { + strides[i] = tensor_strides[i]; + } + it = + internal::tensor_to_strides.insert_or_assign(tensor, std::move(strides)) + .first; + } + + // For 0D tensors, data() returns nullptr on empty vectors, but we need to + // return a valid pointer + if (it->second.empty()) { + static int64_t empty_strides_placeholder = 0; + *ret_strides = &empty_strides_placeholder; + } else { + *ret_strides = it->second.data(); + } + + return Error::Ok; +} + +AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype) { + *ret_dtype = static_cast(tensor->scalar_type()); + + return Error::Ok; +} + +AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes) { + auto it = internal::tensor_to_sizes.find(tensor); + bool needs_update = false; + + if (it == internal::tensor_to_sizes.end()) { + needs_update = true; + } else { + // CRITICAL: Multimodal models reuse tensors with different shapes across + // executions (e.g., variable-length audio). We MUST validate cached + // metadata matches current tensor state, or CUDA kernels will receive + // incorrect shapes leading to memory corruption and segfaults. 
+ auto tensor_sizes = tensor->sizes(); + needs_update = !std::equal( + it->second.begin(), + it->second.end(), + tensor_sizes.begin(), + tensor_sizes.end()); + } + + if (needs_update) { + std::vector sizes(tensor->dim()); + auto tensor_sizes = tensor->sizes(); + for (int i = 0; i < tensor->dim(); i++) { + sizes[i] = tensor_sizes[i]; + } + it = internal::tensor_to_sizes.insert_or_assign(tensor, std::move(sizes)) + .first; + } + + // For 0D tensors, data() returns nullptr on empty vectors, but we need to + // return a valid pointer + if (it->second.empty()) { + static int64_t empty_sizes_placeholder = 0; + *ret_sizes = &empty_sizes_placeholder; + } else { + *ret_sizes = it->second.data(); + } + + return Error::Ok; +} + +AOTITorchError aoti_torch_get_device_index( + Tensor* tensor, + int32_t* ret_device_index) { + // Let's assume all tensors AOTI using are on CUDA:0 + *ret_device_index = 0; + return Error::Ok; +} + +AOTITorchError aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim) { + *ret_dim = static_cast(tensor->dim()); + return Error::Ok; +} + +// Device and layout utility functions +int32_t aoti_torch_device_type_cpu() { + // Let's say cpu is 0 for ET as well + return 0; +} + +int32_t aoti_torch_layout_strided() { + // ET only support strided layout, the return value will always be 0, a.k.a + // at::Layout::Strided; + return 0; +} + +// Dtype constants - these return the PyTorch dtype codes +int32_t aoti_torch_dtype_float32() { + return 6; // PyTorch's float32 dtype code +} + +int32_t aoti_torch_dtype_bfloat16() { + return 15; // PyTorch's bfloat16 dtype code +} + +int32_t aoti_torch_dtype_int64() { + return 4; // PyTorch's int64 dtype code +} + +// Dtype utility function needed by Metal backend. +// Returns the size of the dtype in bytes. 
+size_t aoti_torch_dtype_element_size(int32_t dtype) { + return dtype_to_element_size(dtype); +} + +// Cleanup functions +void cleanup_tensor_metadata() { + internal::tensor_to_sizes.clear(); + internal::tensor_to_strides.clear(); +} + +} // extern "C" + +} // namespace aoti +} // namespace backends +} // namespace executorch diff --git a/backends/aoti/common_shims.h b/backends/aoti/common_shims.h new file mode 100644 index 00000000000..b79e4c86715 --- /dev/null +++ b/backends/aoti/common_shims.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace executorch { +namespace backends { +namespace aoti { + +// Common using declarations for ExecuTorch types +using executorch::runtime::Error; +using executorch::runtime::etensor::Tensor; + +extern "C" { + +// Common AOTI type aliases +using AOTIRuntimeError = Error; +using AOTITorchError = Error; + +// Global storage for tensor metadata +extern std::unordered_map> tensor_to_sizes; +extern std::unordered_map> tensor_to_strides; + +// Attribute-related operations (memory-irrelevant) +AOTITorchError aoti_torch_get_data_ptr(Tensor* tensor, void** ret_data_ptr); + +AOTITorchError aoti_torch_get_storage_offset( + Tensor* tensor, + int64_t* ret_storage_offset); + +AOTITorchError aoti_torch_get_strides(Tensor* tensor, int64_t** ret_strides); + +AOTITorchError aoti_torch_get_dtype(Tensor* tensor, int32_t* ret_dtype); + +AOTITorchError aoti_torch_get_sizes(Tensor* tensor, int64_t** ret_sizes); + +AOTITorchError aoti_torch_get_storage_size(Tensor* tensor, int64_t* ret_size); + +AOTITorchError aoti_torch_get_device_index( + Tensor* tensor, + int32_t* ret_device_index); + +AOTITorchError aoti_torch_get_dim(Tensor* tensor, int64_t* ret_dim); + +// Utility 
functions for device and layout information +int32_t aoti_torch_device_type_cpu(); +int32_t aoti_torch_layout_strided(); +int32_t aoti_torch_dtype_float32(); +int32_t aoti_torch_dtype_bfloat16(); +int32_t aoti_torch_dtype_int64(); + +// Dtype utility function needed by Metal backend +size_t aoti_torch_dtype_element_size(int32_t dtype); + +// Autograd mode functions +int32_t aoti_torch_grad_mode_is_enabled(); +void aoti_torch_grad_mode_set_enabled(bool enabled); + +// Cleanup functions for clearing global state +void cleanup_tensor_metadata(); + +} // extern "C" + +} // namespace aoti +} // namespace backends +} // namespace executorch diff --git a/backends/aoti/targets.bzl b/backends/aoti/targets.bzl new file mode 100644 index 00000000000..8bf44573bb3 --- /dev/null +++ b/backends/aoti/targets.bzl @@ -0,0 +1,58 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + # AOTI common shims functionality + runtime.cxx_library( + name = "common_shims", + srcs = [ + "common_shims.cpp", + ], + headers = [ + "common_shims.h", + "utils.h", + ], + # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) + link_whole = True, + supports_python_dlopen = True, + # Constructor needed for backend registration. + compiler_flags = ["-Wno-global-constructors"], + visibility = ["@EXECUTORCH_CLIENTS"], + deps = [ + "//executorch/runtime/core:core", + "//executorch/runtime/core/exec_aten:lib", + ], + ) + + # AOTI model container functionality + runtime.cxx_library( + name = "model_container", + srcs = [ + "aoti_model_container.cpp", + ], + headers = [ + "aoti_model_container.h", + ], + # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) + link_whole = True, + supports_python_dlopen = True, + # Constructor needed for backend registration. 
+ compiler_flags = ["-Wno-global-constructors"], + visibility = ["@EXECUTORCH_CLIENTS"], + deps = [ + "//executorch/runtime/backend:interface", + "//executorch/runtime/core:core", + ], + ) + + # Common AOTI functionality (combining both common_shims and model_container) + runtime.cxx_library( + name = "aoti_common", + # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) + link_whole = True, + supports_python_dlopen = True, + visibility = ["@EXECUTORCH_CLIENTS"], + exported_deps = [ + ":common_shims", + ":model_container", + ], + ) diff --git a/backends/aoti/tests/TARGETS b/backends/aoti/tests/TARGETS new file mode 100644 index 00000000000..8daa8abd4d7 --- /dev/null +++ b/backends/aoti/tests/TARGETS @@ -0,0 +1,22 @@ +load("@fbcode_macros//build_defs:cpp_unittest.bzl", "cpp_unittest") + +oncall("executorch") + +cpp_unittest( + name = "test_common_shims", + srcs = [ + "test_common_shims.cpp", + ], + headers = [ + "utils.h", + ], + deps = [ + "//executorch/backends/aoti:common_shims", + "//executorch/extension/tensor:tensor", + "//executorch/runtime/core:core", + "//executorch/runtime/platform:platform", + "//executorch/runtime/core/exec_aten/testing_util:tensor_util", + "//executorch/runtime/core/exec_aten:lib", + "//executorch/extension/tensor:tensor", + ], +) diff --git a/backends/aoti/tests/test_common_shims.cpp b/backends/aoti/tests/test_common_shims.cpp new file mode 100644 index 00000000000..980eae96122 --- /dev/null +++ b/backends/aoti/tests/test_common_shims.cpp @@ -0,0 +1,324 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include +#include +#include + +using namespace executorch::backends::aoti; +using namespace executorch::backends::aoti::test; +using namespace executorch::runtime; +using executorch::runtime::etensor::Tensor; + +// Test fixture for common shims tests +class CommonShimsTest : public ::testing::Test { + protected: + void SetUp() override { + // Clean up any existing cached metadata before each test + cleanup_tensor_metadata(); + } + + void TearDown() override { + // Clean up metadata and free any tensor data + cleanup_tensor_metadata(); + for (auto& tensor : test_tensors_) { + free_tensor_data(tensor.get()); + } + test_tensors_.clear(); + } + + // Helper to create and track test tensors for cleanup + Tensor* create_tracked_tensor(const std::vector& sizes) { + auto tensor = create_test_tensor(sizes); + Tensor* ptr = tensor.get(); + test_tensors_.push_back(tensor); + return ptr; + } + + private: + std::vector> test_tensors_; +}; + +// Test aoti_torch_get_sizes basic functionality +TEST_F(CommonShimsTest, GetSizesBasicFunctionality) { + // Test 1D tensor + auto tensor_1d = create_tracked_tensor({5}); + int64_t* sizes_ptr; + AOTITorchError error = aoti_torch_get_sizes(tensor_1d, &sizes_ptr); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(sizes_ptr, nullptr); + EXPECT_EQ(sizes_ptr[0], 5); + + // Test 2D tensor + auto tensor_2d = create_tracked_tensor({3, 4}); + error = aoti_torch_get_sizes(tensor_2d, &sizes_ptr); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(sizes_ptr, nullptr); + EXPECT_EQ(sizes_ptr[0], 3); + EXPECT_EQ(sizes_ptr[1], 4); + + // Test 3D tensor + auto tensor_3d = create_tracked_tensor({2, 3, 4}); + error = aoti_torch_get_sizes(tensor_3d, &sizes_ptr); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(sizes_ptr, nullptr); + EXPECT_EQ(sizes_ptr[0], 2); + EXPECT_EQ(sizes_ptr[1], 3); + EXPECT_EQ(sizes_ptr[2], 4); +} + +// Test aoti_torch_get_strides basic functionality +TEST_F(CommonShimsTest, GetStridesBasicFunctionality) { + // 
Test 1D tensor + auto tensor_1d = create_tracked_tensor({5}); + int64_t* strides_ptr; + AOTITorchError error = aoti_torch_get_strides(tensor_1d, &strides_ptr); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(strides_ptr, nullptr); + EXPECT_EQ(strides_ptr[0], 1); + + // Test 2D tensor - row major: [3, 4] should have strides [4, 1] + auto tensor_2d = create_tracked_tensor({3, 4}); + error = aoti_torch_get_strides(tensor_2d, &strides_ptr); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(strides_ptr, nullptr); + EXPECT_EQ(strides_ptr[0], 4); + EXPECT_EQ(strides_ptr[1], 1); + + // Test 3D tensor - row major: [2, 3, 4] should have strides [12, 4, 1] + auto tensor_3d = create_tracked_tensor({2, 3, 4}); + error = aoti_torch_get_strides(tensor_3d, &strides_ptr); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(strides_ptr, nullptr); + EXPECT_EQ(strides_ptr[0], 12); + EXPECT_EQ(strides_ptr[1], 4); + EXPECT_EQ(strides_ptr[2], 1); +} + +// Test caching logic for sizes +TEST_F(CommonShimsTest, SizesCachingLogic) { + auto tensor = create_tracked_tensor({2, 3, 4}); + + // First call should cache the sizes + int64_t* sizes_ptr1; + AOTITorchError error = aoti_torch_get_sizes(tensor, &sizes_ptr1); + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(sizes_ptr1, nullptr); + + // Second call should return the same cached pointer + int64_t* sizes_ptr2; + error = aoti_torch_get_sizes(tensor, &sizes_ptr2); + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(sizes_ptr1, sizes_ptr2); // Should be the exact same pointer + + // Values should still be correct + EXPECT_EQ(sizes_ptr2[0], 2); + EXPECT_EQ(sizes_ptr2[1], 3); + EXPECT_EQ(sizes_ptr2[2], 4); +} + +// Test caching logic for strides +TEST_F(CommonShimsTest, StridesCachingLogic) { + auto tensor = create_tracked_tensor({2, 3, 4}); + + // First call should cache the strides + int64_t* strides_ptr1; + AOTITorchError error = aoti_torch_get_strides(tensor, &strides_ptr1); + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(strides_ptr1, nullptr); + + // Second call should return 
the same cached pointer + int64_t* strides_ptr2; + error = aoti_torch_get_strides(tensor, &strides_ptr2); + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(strides_ptr1, strides_ptr2); // Should be the exact same pointer + + // Values should still be correct + EXPECT_EQ(strides_ptr2[0], 12); + EXPECT_EQ(strides_ptr2[1], 4); + EXPECT_EQ(strides_ptr2[2], 1); +} + +// Test that different tensors have different cached entries +TEST_F(CommonShimsTest, DifferentTensorsCacheSeparately) { + auto tensor1 = create_tracked_tensor({2, 3}); + auto tensor2 = create_tracked_tensor({4, 5}); + + // Get sizes for both tensors + int64_t* sizes1_ptr; + int64_t* sizes2_ptr; + + EXPECT_EQ(aoti_torch_get_sizes(tensor1, &sizes1_ptr), Error::Ok); + EXPECT_EQ(aoti_torch_get_sizes(tensor2, &sizes2_ptr), Error::Ok); + + // Pointers should be different (different cache entries) + EXPECT_NE(sizes1_ptr, sizes2_ptr); + + // Values should be correct + EXPECT_EQ(sizes1_ptr[0], 2); + EXPECT_EQ(sizes1_ptr[1], 3); + EXPECT_EQ(sizes2_ptr[0], 4); + EXPECT_EQ(sizes2_ptr[1], 5); + + // Test strides as well + int64_t* strides1_ptr; + int64_t* strides2_ptr; + + EXPECT_EQ(aoti_torch_get_strides(tensor1, &strides1_ptr), Error::Ok); + EXPECT_EQ(aoti_torch_get_strides(tensor2, &strides2_ptr), Error::Ok); + + // Pointers should be different (different cache entries) + EXPECT_NE(strides1_ptr, strides2_ptr); + + // Values should be correct + EXPECT_EQ(strides1_ptr[0], 3); + EXPECT_EQ(strides1_ptr[1], 1); + EXPECT_EQ(strides2_ptr[0], 5); + EXPECT_EQ(strides2_ptr[1], 1); +} + +// Test cache persistence across multiple calls +TEST_F(CommonShimsTest, CachePersistence) { + auto tensor = create_tracked_tensor({3, 4, 5}); + + // Multiple calls to sizes should all return the same pointer + int64_t* sizes_ptr1; + int64_t* sizes_ptr2; + int64_t* sizes_ptr3; + + EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr1), Error::Ok); + EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr2), Error::Ok); + EXPECT_EQ(aoti_torch_get_sizes(tensor, 
&sizes_ptr3), Error::Ok); + + EXPECT_EQ(sizes_ptr1, sizes_ptr2); + EXPECT_EQ(sizes_ptr2, sizes_ptr3); + + // Multiple calls to strides should all return the same pointer + int64_t* strides_ptr1; + int64_t* strides_ptr2; + int64_t* strides_ptr3; + + EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr1), Error::Ok); + EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr2), Error::Ok); + EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr3), Error::Ok); + + EXPECT_EQ(strides_ptr1, strides_ptr2); + EXPECT_EQ(strides_ptr2, strides_ptr3); +} + +// Test 0D tensor (scalar) +TEST_F(CommonShimsTest, ScalarTensor) { + auto tensor_0d = create_tracked_tensor({}); + + // Test sizes for 0D tensor + int64_t* sizes_ptr; + AOTITorchError error = aoti_torch_get_sizes(tensor_0d, &sizes_ptr); + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(sizes_ptr, nullptr); + + // Test strides for 0D tensor + int64_t* strides_ptr; + error = aoti_torch_get_strides(tensor_0d, &strides_ptr); + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(strides_ptr, nullptr); + + // Cache should work for 0D tensors too + int64_t* sizes_ptr2; + error = aoti_torch_get_sizes(tensor_0d, &sizes_ptr2); + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(sizes_ptr, sizes_ptr2); +} + +// Test large tensor dimensions +TEST_F(CommonShimsTest, LargeTensorDimensions) { + auto tensor = create_tracked_tensor({100, 200, 300, 400}); + + // Test sizes + int64_t* sizes_ptr; + AOTITorchError error = aoti_torch_get_sizes(tensor, &sizes_ptr); + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(sizes_ptr, nullptr); + EXPECT_EQ(sizes_ptr[0], 100); + EXPECT_EQ(sizes_ptr[1], 200); + EXPECT_EQ(sizes_ptr[2], 300); + EXPECT_EQ(sizes_ptr[3], 400); + + // Test strides - expected: [24000000, 120000, 400, 1] + int64_t* strides_ptr; + error = aoti_torch_get_strides(tensor, &strides_ptr); + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(strides_ptr, nullptr); + EXPECT_EQ(strides_ptr[0], 24000000); + EXPECT_EQ(strides_ptr[1], 120000); + EXPECT_EQ(strides_ptr[2], 400); + 
EXPECT_EQ(strides_ptr[3], 1); +} + +// Test that cleanup_tensor_metadata clears the cache +TEST_F(CommonShimsTest, CleanupFunctionality) { + auto tensor = create_tracked_tensor({2, 3}); + + // Cache some data + int64_t* sizes_ptr1; + int64_t* strides_ptr1; + EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr1), Error::Ok); + EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr1), Error::Ok); + + // Clear the cache + cleanup_tensor_metadata(); + + // Getting sizes/strides again should create new cache entries + // (We can't directly test if the pointers are different since that would be + // implementation-dependent, but we can at least verify the functions still + // work) + int64_t* sizes_ptr2; + int64_t* strides_ptr2; + EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr2), Error::Ok); + EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr2), Error::Ok); + + // Values should still be correct + EXPECT_EQ(sizes_ptr2[0], 2); + EXPECT_EQ(sizes_ptr2[1], 3); + EXPECT_EQ(strides_ptr2[0], 3); + EXPECT_EQ(strides_ptr2[1], 1); +} + +// Test mixed operations to ensure caches are independent +TEST_F(CommonShimsTest, IndependentCaches) { + auto tensor = create_tracked_tensor({2, 3, 4}); + + // Get sizes first + int64_t* sizes_ptr1; + EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr1), Error::Ok); + + // Get strides + int64_t* strides_ptr1; + EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr1), Error::Ok); + + // Get sizes again - should be cached + int64_t* sizes_ptr2; + EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr2), Error::Ok); + EXPECT_EQ(sizes_ptr1, sizes_ptr2); + + // Get strides again - should be cached + int64_t* strides_ptr2; + EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr2), Error::Ok); + EXPECT_EQ(strides_ptr1, strides_ptr2); + + // Sizes and strides pointers should be different (different caches) + EXPECT_NE(sizes_ptr1, strides_ptr1); +} diff --git a/backends/aoti/tests/utils.h b/backends/aoti/tests/utils.h new file mode 100644 index 
00000000000..1f26f7e2d51 --- /dev/null +++ b/backends/aoti/tests/utils.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include + +namespace executorch { +namespace backends { +namespace aoti { +namespace test { + +// Use the same type aliases as in common_shims.h +using executorch::runtime::etensor::Tensor; + +/** + * Creates a test tensor with the specified shape and scalar type + */ +inline std::shared_ptr create_test_tensor( + const std::vector& sizes, + exec_aten::ScalarType dtype = exec_aten::ScalarType::Float) { + // Calculate total number of elements + int64_t total_elements = 1; + for (int64_t size : sizes) { + total_elements *= size; + } + + // Calculate strides (row-major layout) + std::vector strides(sizes.size()); + if (sizes.size() > 0) { + strides[sizes.size() - 1] = 1; + for (int i = sizes.size() - 2; i >= 0; i--) { + strides[i] = strides[i + 1] * sizes[i + 1]; + } + } + + // Allocate data buffer + size_t dtype_size = exec_aten::elementSize(dtype); + void* data = malloc(total_elements * dtype_size); + + // Convert sizes and strides to the required type + std::vector sizes_converted( + sizes.begin(), sizes.end()); + std::vector strides_converted( + strides.begin(), strides.end()); + + // Create the tensor with the correct argument types and count + auto tensor = executorch::extension::from_blob( + data, sizes_converted, strides_converted, dtype); + + return tensor; +} + +/** + * Helper to clean up tensor data that was allocated with malloc + */ +inline void free_tensor_data(Tensor* tensor) { + if (tensor && tensor->mutable_data_ptr()) { + free(tensor->mutable_data_ptr()); + } +} + +} // namespace test +} // namespace aoti +} // namespace backends +} // namespace executorch diff --git a/backends/aoti/utils.h 
b/backends/aoti/utils.h new file mode 100644 index 00000000000..78c07bcea6e --- /dev/null +++ b/backends/aoti/utils.h @@ -0,0 +1,99 @@ + +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace executorch { +namespace backends { +namespace aoti { + +// Common using declarations for ExecuTorch types +using executorch::runtime::Error; + +extern "C" { + +// Common AOTI type aliases +using AOTITorchError = Error; + +// Map int32_t dtype to ExecuTorch ScalarType (robust version of hardcoded +// ScalarType::Float) +inline executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype) { + // Convert based on known PyTorch dtype codes (without CUDA-specific + // dependency) + switch (dtype) { + case 4: // PyTorch's int64 dtype code + return executorch::aten::ScalarType::Long; + case 6: // PyTorch's float32 dtype code + return executorch::aten::ScalarType::Float; + case 15: // PyTorch's bfloat16 dtype code + return executorch::aten::ScalarType::BFloat16; + // Future support for additional dtypes can be added here + default: + ET_LOG(Error, "Unsupported dtype: %d for ScalarType conversion", dtype); + return executorch::aten::ScalarType::Undefined; + } +} + +// Map int32_t dtype to number of bytes per element (reusing ExecuTorch's +// elementSize function) +inline size_t dtype_to_element_size(int32_t dtype) { + // First convert int32_t dtype to ExecuTorch ScalarType, then use existing + // elementSize function + executorch::aten::ScalarType scalar_type = dtype_to_scalar_type(dtype); + if (scalar_type == executorch::aten::ScalarType::Undefined) { + ET_LOG(Error, "Unsupported dtype: %d for element size calculation", dtype); + return 0; // Return 0 to indicate error + } + + // Reuse ExecuTorch's existing elementSize 
function from scalar_type_util.h + return executorch::runtime::elementSize(scalar_type); +} + +// Storage offset validation utility function +inline AOTITorchError validate_storage_offset(int64_t storage_offset) { + // Storage offset must always be 0 + if (storage_offset != 0) { + ET_LOG( + Error, + "Storage offset must be 0. Got storage_offset: %ld", + storage_offset); + return Error::InvalidArgument; + } + return Error::Ok; +} + +// Check if tensor is in contiguous memory format (NCHW for 4D tensors) +// Contiguous format means strides decrease from left to right: +// For NCHW: strides = [C*H*W, H*W, W, 1] +inline bool is_tensor_contiguous( + int64_t ndim, + const int64_t* sizes, + const int64_t* strides) { + int64_t expected_stride = 1; + for (int64_t i = ndim - 1; i >= 0; i--) { + if (strides[i] != expected_stride) { + return false; + } + expected_stride *= sizes[i]; + } + return true; +} + +} // extern "C" + +} // namespace aoti +} // namespace backends +} // namespace executorch diff --git a/backends/apple/coreml/CMakeLists.txt b/backends/apple/coreml/CMakeLists.txt index 9879a05e3dc..17e2d94e336 100644 --- a/backends/apple/coreml/CMakeLists.txt +++ b/backends/apple/coreml/CMakeLists.txt @@ -115,7 +115,7 @@ if(APPLE) endif() target_compile_options(coreml_util PUBLIC -fPIC) -install(TARGETS coreml_util DESTINATION lib) +install(TARGETS coreml_util DESTINATION ${CMAKE_INSTALL_LIBDIR}) install( DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/runtime/util @@ -154,7 +154,7 @@ target_compile_options(coreml_inmemoryfs PUBLIC -fPIC) install( TARGETS coreml_inmemoryfs - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} INCLUDES DESTINATION ${_common_include_directories} ) @@ -251,7 +251,7 @@ if(APPLE) install( TARGETS coremldelegate coreml_util coreml_inmemoryfs EXPORT ExecuTorchTargets - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} ) diff --git a/backends/apple/coreml/compiler/coreml_preprocess.py 
b/backends/apple/coreml/compiler/coreml_preprocess.py index d1614f30451..16ace2e7a88 100644 --- a/backends/apple/coreml/compiler/coreml_preprocess.py +++ b/backends/apple/coreml/compiler/coreml_preprocess.py @@ -6,6 +6,7 @@ import logging import shutil +import tempfile import uuid from dataclasses import asdict, dataclass from enum import Enum @@ -415,7 +416,7 @@ def preprocess_model( mlmodel: ct.models.MLModel, model_type: MODEL_TYPE ) -> PreprocessResult: identifier = "executorch_" + str(uuid.uuid4()) - dir_path: Path = Path("tmp") / identifier + dir_path: Path = Path(tempfile.gettempdir()) / identifier model_dir_path: Path = dir_path / "lowered_module" model_spec: ct.proto.Model_pb2 = mlmodel.get_spec() logger.warning( diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.h b/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.h index a9e06efa90d..11d957044e9 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.h +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.h @@ -99,17 +99,6 @@ NS_ASSUME_NONNULL_BEGIN - (NSUInteger)compact:(NSUInteger)sizeInBytes error:(NSError* __autoreleasing*)error; -/// Executes a block with a unique temporary directory. -/// -/// A new temporary subdirectory URL is created inside the receiver’s designated -/// base directory. The directory is passed to the block, which can use it to -/// perform temporary file operations. After the block finishes executing, -/// the directory and its contents are removed. -/// -/// @param block A block to execute. The block receives a unique URL. -- (void)withTemporaryDirectory:(void (^)(NSURL* directoryURL))block; - - /// Purges the assets storage. The assets are moved to the trash directory and are asynchronously /// deleted. /// @@ -128,12 +117,6 @@ NS_ASSUME_NONNULL_BEGIN /// contents are deleted asynchronously. 
@property (copy, readonly, nonatomic) NSURL* trashDirectoryURL; - -/// The staging directory URL, used to hold assets that are being prepared or processed -/// before they are moved into their final location. The contents of this directory -/// are temporary and may be cleared when no longer needed. -@property (copy, readonly, nonatomic) NSURL* stagingDirectoryURL; - /// The file manager. @property (strong, readonly, nonatomic) NSFileManager* fileManager; diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.mm index 53c3d1cdc69..256026e1f09 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLAssetManager.mm @@ -254,29 +254,6 @@ BOOL is_asset_alive(NSMapTable *assets_in_use_map, return assets; } - -NSURL * _Nullable move_to_directory(NSURL *url, - NSURL *directoryURL, - NSFileManager *fileManager, - NSError * __autoreleasing *error) { - if (!url) { - ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorInternalError, "Move operation failed: source URL is nil."); - return nil; - } - - if (!directoryURL) { - ETCoreMLLogErrorAndSetNSError(error, ETCoreMLErrorInternalError, "Move operation failed: destination URL is nil."); - return nil; - } - - NSURL *dstURL = [directoryURL URLByAppendingPathComponent:[NSUUID UUID].UUIDString]; - if (![fileManager moveItemAtURL:url toURL:dstURL error:error]) { - return nil; - } - - return dstURL; -} - } //namespace @interface ETCoreMLAssetManager () { @@ -322,17 +299,12 @@ - (nullable instancetype)initWithDatabase:(const std::shared_ptr&)data if (!managedAssetsDirectoryURL) { return nil; } - + NSURL *managedTrashDirectoryURL = ::create_directory_if_needed(trashDirectoryURL, @"models", fileManager, error); if (!managedTrashDirectoryURL) { return nil; } - - NSURL *managedStagingDirectoryURL = ::create_directory_if_needed(assetsDirectoryURL, @"staging", fileManager, error); - if 
(!managedStagingDirectoryURL) { - return nil; - } - + // If directory is empty then purge the stores if (::is_directory_empty(managedAssetsDirectoryURL, fileManager, nil)) { assetsMetaStore.impl()->purge(ec); @@ -343,7 +315,6 @@ - (nullable instancetype)initWithDatabase:(const std::shared_ptr&)data _assetsStore = std::move(assetsStore); _assetsMetaStore = std::move(assetsMetaStore); _assetsDirectoryURL = managedAssetsDirectoryURL; - _stagingDirectoryURL = managedStagingDirectoryURL; _trashDirectoryURL = managedTrashDirectoryURL; _estimatedSizeInBytes = sizeInBytes.value(); _maxAssetsSizeInBytes = maxAssetsSizeInBytes; @@ -375,15 +346,15 @@ - (nullable instancetype)initWithDatabaseURL:(NSURL *)databaseURL error:error]; } -- (void)withTemporaryDirectory:(void (^)(NSURL *directoryURL))block { - NSURL *dstURL = [self.stagingDirectoryURL URLByAppendingPathComponent:[NSUUID UUID].UUIDString]; - block(dstURL); - if (![self.fileManager fileExistsAtPath:dstURL.path]) { - return; +- (nullable NSURL *)moveURL:(NSURL *)url + toUniqueURLInDirectory:(NSURL *)directoryURL + error:(NSError * __autoreleasing *)error { + NSURL *dstURL = [directoryURL URLByAppendingPathComponent:[NSUUID UUID].UUIDString]; + if (![self.fileManager moveItemAtURL:url toURL:dstURL error:error]) { + return nil; } - - move_to_directory(dstURL, self.trashDirectoryURL, self.fileManager, nil); - [self cleanupTrashDirectory]; + + return dstURL; } - (void)cleanupAssetIfNeeded:(ETCoreMLAsset *)asset { @@ -436,8 +407,9 @@ - (nullable ETCoreMLAsset *)_storeAssetAtURL:(NSURL *)srcURL return false; } - // If a file already exists at `dstURL`, move it to the trash for removal. - move_to_directory(dstURL, self.trashDirectoryURL, self.fileManager, nil); + // If an asset exists move it + [self moveURL:dstURL toUniqueURLInDirectory:self.trashDirectoryURL error:nil]; + // Move the asset to assets directory. 
if (![self.fileManager moveItemAtURL:srcURL toURL:dstURL error:error]) { return false; @@ -461,25 +433,16 @@ - (nullable ETCoreMLAsset *)_storeAssetAtURL:(NSURL *)srcURL } - (void)triggerCompaction { - if (self.estimatedSizeInBytes >= self.maxAssetsSizeInBytes) { - __weak __typeof(self) weakSelf = self; - dispatch_async(self.syncQueue, ^{ - NSError *localError = nil; - if (![weakSelf _compact:self.maxAssetsSizeInBytes error:&localError]) { - ETCoreMLLogError(localError, "Failed to compact asset store."); - } - }); + if (self.estimatedSizeInBytes < self.maxAssetsSizeInBytes) { + return; } - - // Always clean the trash directory to ensure a minimal footprint. - // The `trashQueue` is serialized, so only one cleanup will run at a time. - [self cleanupTrashDirectory]; -} - -- (void)cleanupTrashDirectory { + __weak __typeof(self) weakSelf = self; - dispatch_async(self.trashQueue, ^{ - [weakSelf removeFilesInTrashDirectory]; + dispatch_async(self.syncQueue, ^{ + NSError *localError = nil; + if (![weakSelf _compact:self.maxAssetsSizeInBytes error:&localError]) { + ETCoreMLLogError(localError, "Failed to compact asset store."); + } }); } @@ -585,7 +548,7 @@ - (BOOL)_removeAssetWithIdentifier:(NSString *)identifier NSURL *assetURL = ::get_asset_url(assetValue); if ([self.fileManager fileExistsAtPath:assetURL.path] && - !move_to_directory(assetURL, self.trashDirectoryURL, self.fileManager, error)) { + ![self moveURL:assetURL toUniqueURLInDirectory:self.trashDirectoryURL error:error]) { return false; } @@ -686,7 +649,13 @@ - (NSUInteger)_compact:(NSUInteger)sizeInBytes error:(NSError * __autoreleasing identifier); } } - + + // Trigger cleanup. 
+ __weak __typeof(self) weakSelf = self; + dispatch_async(self.trashQueue, ^{ + [weakSelf removeFilesInTrashDirectory]; + }); + return _estimatedSizeInBytes; } @@ -695,10 +664,7 @@ - (NSUInteger)compact:(NSUInteger)sizeInBytes error:(NSError * __autoreleasing * dispatch_sync(self.syncQueue, ^{ result = [self _compact:sizeInBytes error:error]; }); - - // Always clean the trash directory to ensure a minimal footprint. - // The `trashQueue` is serialized, so only one cleanup will run at a time. - [self cleanupTrashDirectory]; + return result; } @@ -742,7 +708,7 @@ - (BOOL)_purge:(NSError * __autoreleasing *)error { } // Move the the whole assets directory to the temp directory. - if (!move_to_directory(self.assetsDirectoryURL, self.trashDirectoryURL, self.fileManager, error)) { + if (![self moveURL:self.assetsDirectoryURL toUniqueURLInDirectory:self.trashDirectoryURL error:error]) { return false; } @@ -758,7 +724,13 @@ - (BOOL)_purge:(NSError * __autoreleasing *)error { ::set_error_from_error_code(ec, error); // Trigger cleanup - [self cleanupTrashDirectory]; + if (status) { + __weak __typeof(self) weakSelf = self; + dispatch_async(self.trashQueue, ^{ + [weakSelf removeFilesInTrashDirectory]; + }); + } + return static_cast(status); } diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelLoader.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModelLoader.mm index 9e8ae04842e..05aa910d954 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModelLoader.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModelLoader.mm @@ -62,12 +62,21 @@ + (nullable ETCoreMLModel *)loadModelWithContentsOfURL:(NSURL *)compiledModelURL if (model) { return model; } - - if (error) { - *error = localError; + + if (localError) { + ETCoreMLLogError(localError, + "Failed to load model from compiled asset with identifier = %@", + identifier); } - - return nil; + + // If store failed then we will load the model from compiledURL. 
+ auto backingAsset = Asset::make(compiledModelURL, identifier, assetManager.fileManager, error); + if (!backingAsset) { + return nil; + } + + asset = [[ETCoreMLAsset alloc] initWithBackingAsset:backingAsset.value()]; + return ::get_model_from_asset(asset, configuration, metadata, error); } @end diff --git a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm index 524ceaf7e28..2347936fd34 100644 --- a/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm +++ b/backends/apple/coreml/runtime/delegate/ETCoreMLModelManager.mm @@ -345,10 +345,6 @@ void add_compute_unit(std::string& identifier, MLComputeUnits compute_units) { return [ETCoreMLModelDebugInfo modelDebugInfoFromData:file_data error:error]; } -NSString *raw_model_identifier(NSString *identifier) { - return [NSString stringWithFormat:@"raw_%@", identifier]; -} - #endif } //namespace @@ -412,7 +408,7 @@ - (nullable ETCoreMLAsset *)assetWithIdentifier:(NSString *)identifier { return modelAsset; } - __block NSError *localError = nil; + NSError *localError = nil; modelAsset = [self.assetManager assetWithIdentifier:identifier error:&localError]; if (localError) { ETCoreMLLogError(localError, @@ -424,9 +420,8 @@ - (nullable ETCoreMLAsset *)assetWithIdentifier:(NSString *)identifier { } - (nullable NSURL *)compiledModelURLWithIdentifier:(NSString *)identifier - modelURL:(nullable NSURL *)modelURL inMemoryFS:(const inmemoryfs::InMemoryFileSystem*)inMemoryFS - dstURL:(NSURL *)dstURL + assetManager:(ETCoreMLAssetManager *)assetManager error:(NSError * __autoreleasing *)error { auto modelAssetType = get_model_asset_type(inMemoryFS); if (!modelAssetType) { @@ -435,135 +430,80 @@ - (nullable NSURL *)compiledModelURLWithIdentifier:(NSString *)identifier "AOT blob is missing model file."); return nil; } - - // If modelURL is not provided, write model files to the destination directory (dstURL) - // and obtain a URL pointing to them. 
Otherwise, use the provided modelURL. - modelURL = (modelURL == nil) ? ::write_model_files(dstURL, self.fileManager, identifier, modelAssetType.value(), inMemoryFS, error) : modelURL; - if (!modelURL) { - // Failed to generate or locate model files, return nil. - return nil; - } - - // Handle based on the type of the model asset. + + NSURL *dstURL = [self.assetManager.trashDirectoryURL URLByAppendingPathComponent:[NSUUID UUID].UUIDString]; + NSURL *modelURL = ::write_model_files(dstURL, self.fileManager, identifier, modelAssetType.value(), inMemoryFS, error); switch (modelAssetType.value()) { case ModelAssetType::CompiledModel: { - // The model is already compiled; no further action needed. - // Return the existing model URL. + // Model is already compiled. ETCoreMLLogInfo("The model in the pte file is pre-compiled. Skipping compilation."); return modelURL; } - + case ModelAssetType::Model: { - // The model is not compiled yet. - // Compile the model at the specified URL with a maximum wait time of 5 minutes. + // Compile the model. ETCoreMLLogInfo("The model in the pte file is not pre-compiled. Compiling with a 5 min timeout."); NSURL *compiledModelURL = [ETCoreMLModelCompiler compileModelAtURL:modelURL maxWaitTimeInSeconds:(5 * 60) error:error]; - // Return the URL of the compiled model or nil if compilation fails. 
+ return compiledModelURL; } } } -- (nullable ETCoreMLAsset *)compiledModelAssetWithMetadata:(const ModelMetadata&)metadata - modelURL:(nullable NSURL *)modelURL - inMemoryFS:(const inmemoryfs::InMemoryFileSystem*)inMemoryFS - error:(NSError * __autoreleasing *)error { - NSString *identifier = @(metadata.identifier.c_str()); - __block ETCoreMLAsset *compiledModelAsset = [self assetWithIdentifier:identifier]; - if (compiledModelAsset) { - ETCoreMLLogInfo("Cache Hit: Successfully retrieved compiled model with identifier=%@ from the models cache.", identifier); - } else { - ETCoreMLLogInfo("Cache Miss: Compiled Model with identifier=%@ was not found in the models cache.", identifier); - } - - [self.assetManager withTemporaryDirectory:^(NSURL * _Nonnull directoryURL) { - if (compiledModelAsset) { - return; - } - - // The directory specified by `directoryURL` is unique and will be automatically cleaned up - // once the enclosing block completes. - NSURL *compiledModelURL = [self compiledModelURLWithIdentifier:identifier - modelURL:modelURL - inMemoryFS:inMemoryFS - dstURL:directoryURL - error:error]; - if (compiledModelURL) { - // Move the compiled model to the asset manager to transfer ownership. 
- ETCoreMLLogInfo("Storing compiled asset with identifier=%@ in the asset manager.", identifier); - compiledModelAsset = [self.assetManager storeAssetAtURL:compiledModelURL withIdentifier:identifier error:error]; - } - }]; - - return compiledModelAsset; -} - #if ET_EVENT_TRACER_ENABLED -- (nullable ETCoreMLAsset *)modelAssetWithMetadata:(const ModelMetadata&)metadata - inMemoryFS:(const inmemoryfs::InMemoryFileSystem*)inMemoryFS - error:(NSError * __autoreleasing *)error { +- (nullable id)modelExecutorWithMetadata:(const ModelMetadata&)metadata + inMemoryFS:(const inmemoryfs::InMemoryFileSystem*)inMemoryFS + configuration:(MLModelConfiguration *)configuration + error:(NSError * __autoreleasing *)error { NSString *identifier = @(metadata.identifier.c_str()); - NSString *rawIdentifier = raw_model_identifier(identifier); - __block ETCoreMLAsset *modelAsset = [self assetWithIdentifier:rawIdentifier]; - if (modelAsset) { + // Otherwise try to retrieve the compiled asset. + ETCoreMLAsset *compiledModelAsset = [self assetWithIdentifier:identifier]; + if (compiledModelAsset) { ETCoreMLLogInfo("Cache Hit: Successfully retrieved model with identifier=%@ from the models cache.", identifier); } else { ETCoreMLLogInfo("Cache Miss: Model with identifier=%@ was not found in the models cache.", identifier); } - - [self.assetManager withTemporaryDirectory:^(NSURL * _Nonnull directoryURL) { - if (modelAsset) { - return; - } - - auto modelAssetType = get_model_asset_type(inMemoryFS); - if (modelAssetType != ModelAssetType::Model) { - return; - } - - // The directory specified by `directoryURL` is unique and will be automatically cleaned up - // once the enclosing block completes. - NSURL *modelURL = ::write_model_files(directoryURL, - self.fileManager, - identifier, - modelAssetType.value(), - inMemoryFS, - error); + + // Create a unique directory for writing model files. 
+ NSURL *dstURL = [self.assetManager.trashDirectoryURL URLByAppendingPathComponent:[NSUUID UUID].UUIDString]; + auto modelAssetType = get_model_asset_type(inMemoryFS); + ETCoreMLAsset *modelAsset = nil; + // Write the model files. + if (modelAssetType == ModelAssetType::Model) { + NSURL *modelURL = ::write_model_files(dstURL, self.fileManager, identifier, modelAssetType.value(), inMemoryFS, error); if (modelURL) { - // Move the model to the asset manager to transfer ownership. - modelAsset = [self.assetManager storeAssetAtURL:modelURL withIdentifier:rawIdentifier error:error]; + modelAsset = make_asset(modelURL, + identifier, + self.fileManager, + error); } - }]; - - return modelAsset; -} - -- (nullable id)modelExecutorWithMetadata:(const ModelMetadata&)metadata - inMemoryFS:(const inmemoryfs::InMemoryFileSystem*)inMemoryFS - configuration:(MLModelConfiguration *)configuration - error:(NSError * __autoreleasing *)error { - NSError *localError = nil; - ETCoreMLAsset *modelAsset = [self modelAssetWithMetadata:metadata inMemoryFS:inMemoryFS error:&localError]; - if (localError) { - if (error) { - *error = localError; - } - - return nil; } - - ETCoreMLAsset *compiledModelAsset = [self compiledModelAssetWithMetadata:metadata - modelURL:modelAsset.contentURL - inMemoryFS:inMemoryFS - error:error]; + + if (!compiledModelAsset) { + // Compile the model. 
+ NSURL *compiledModelURL = [self compiledModelURLWithIdentifier:identifier + inMemoryFS:inMemoryFS + assetManager:self.assetManager + error:error]; + compiledModelAsset = make_asset(compiledModelURL, + identifier, + self.fileManager, + error); + } + if (!compiledModelAsset) { return nil; } + + NSError *localError = nil; + ETCoreMLModelDebugInfo *debug_info = get_model_debug_info(inMemoryFS, &localError); + if (localError) { + ETCoreMLLogError(localError, "Failed to parse debug info file"); + } + - ETCoreMLModelDebugInfo *debug_info = get_model_debug_info(inMemoryFS, error); - // The analyzer requires both the raw (uncompiled) asset and the compiled model asset to perform analysis. return [[ETCoreMLModelAnalyzer alloc] initWithCompiledModelAsset:compiledModelAsset modelAsset:modelAsset modelDebugInfo:debug_info @@ -572,33 +512,41 @@ - (nullable ETCoreMLAsset *)modelAssetWithMetadata:(const ModelMetadata&)metadat assetManager:self.assetManager error:error]; } + #else - (nullable id)modelExecutorWithMetadata:(const ModelMetadata&)metadata inMemoryFS:(const inmemoryfs::InMemoryFileSystem*)inMemoryFS configuration:(MLModelConfiguration *)configuration error:(NSError * __autoreleasing *)error { - ETCoreMLAsset *compiledModelAsset = [self compiledModelAssetWithMetadata:metadata - modelURL:nil - inMemoryFS:inMemoryFS - error:error]; - if (!compiledModelAsset) { - return nil; + NSString *identifier = @(metadata.identifier.c_str()); + // Otherwise try to retrieve the compiled asset. + ETCoreMLAsset *asset = [self assetWithIdentifier:identifier]; + ETCoreMLModel *model = asset ? 
get_model_from_asset(asset, configuration, metadata, error) : nil; + if (model) { + ETCoreMLLogInfo("Cache Hit: Successfully retrieved model with identifier=%@ from the models cache.", identifier); + return [[ETCoreMLDefaultModelExecutor alloc] initWithModel:model]; } - - ETCoreMLModel *model = [ETCoreMLModelLoader loadModelWithContentsOfURL:compiledModelAsset.contentURL - configuration:configuration - metadata:metadata - assetManager:self.assetManager - error:error]; - if (!model) { + + ETCoreMLLogInfo("Cache Miss: Model with identifier=%@ was not found in the models cache.", identifier); + // Compile the model. + NSURL *compiledModelURL = [self compiledModelURLWithIdentifier:identifier + inMemoryFS:inMemoryFS + assetManager:self.assetManager + error:error]; + if (!compiledModelURL) { return nil; } - + + model = [ETCoreMLModelLoader loadModelWithContentsOfURL:compiledModelURL + configuration:configuration + metadata:metadata + assetManager:self.assetManager + error:error]; + return [[ETCoreMLDefaultModelExecutor alloc] initWithModel:model]; } #endif - - (nullable id)_modelExecutorWithAOTData:(NSData *)data configuration:(MLModelConfiguration *)configuration error:(NSError * __autoreleasing *)error { @@ -783,7 +731,6 @@ - (BOOL)executeModelWithHandle:(ModelHandle *)handle args.count); return result; } - NSError *localError = nil; @autoreleasepool { NSArray *inputs = [args subarrayWithRange:NSMakeRange(0, model.orderedInputNames.count)]; @@ -803,11 +750,11 @@ - (BOOL)executeModelWithHandle:(ModelHandle *)handle result = YES; } } - - if (localError && error) { - *error = localError; + if (!result) { + if (error) { + *error = localError; + } } - return result; } diff --git a/backends/apple/mps/CMakeLists.txt b/backends/apple/mps/CMakeLists.txt index 5a253347b01..99a8afa16ac 100644 --- a/backends/apple/mps/CMakeLists.txt +++ b/backends/apple/mps/CMakeLists.txt @@ -77,7 +77,7 @@ target_compile_options(mpsdelegate PRIVATE "-fno-objc-arc") install( TARGETS mpsdelegate 
mps_schema EXPORT ExecuTorchTargets - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} INCLUDES DESTINATION ${_common_include_directories} ) diff --git a/backends/arm/README.md b/backends/arm/README.md index e495a8e40cb..0abf5e9bf55 100644 --- a/backends/arm/README.md +++ b/backends/arm/README.md @@ -6,7 +6,7 @@ PyTorch models to a TOSA representation. This representation is used to deploy to the following targets: - **Arm® Ethos™-U55/65/85** - Compiled using the Ethos-U Vela compiler. -- **VGF (Vulkan® Graph Format)** – SPIR-V™ representation for Vulkan-capable devices. +- **VGF Format, for ML extensions for Vulkan®** – a format containing SPIR-V™ ML operators for Vulkan-capable devices. The backend provides an ahead-of-time (AOT) flow, that produces a PTE file for your chosen target. The AOT flow supports the following development operating systems: diff --git a/backends/arm/TARGETS b/backends/arm/TARGETS index a78ab252739..a737c4bc9de 100644 --- a/backends/arm/TARGETS +++ b/backends/arm/TARGETS @@ -106,3 +106,17 @@ runtime.python_library( "//caffe2:torch", ] ) +runtime.python_library( + name = "_factory", + srcs = [ + "util/_factory.py" + ], + deps = [ + ":ethosu", + ":vgf", + ":arm_compile_spec", + "//executorch/backends/arm/quantizer:lib", + "//executorch/exir/backend:operator_support", + "//executorch/exir/backend:compile_spec_schema", + ] +) diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py index f9e23f73cc5..b1337c38a58 100644 --- a/backends/arm/_passes/__init__.py +++ b/backends/arm/_passes/__init__.py @@ -27,6 +27,7 @@ from .convert_to_clamp import ConvertToClampPass # noqa from .decompose_acosh_pass import DecomposeAcoshPass # noqa from .decompose_adaptive_avg_pool2d_pass import DecomposeAdaptiveAvgPool2dPass # noqa +from .decompose_add_sub_alpha_pass import DecomposeAddSubAlphaPass # noqa from .decompose_addmm_pass import DecomposeAddmmPass # noqa from .decompose_asin_and_acos_pass import DecomposeAsinAndAcosPass # 
noqa from .decompose_asinh_pass import DecomposeAsinhPass # noqa @@ -46,6 +47,9 @@ from .decompose_glu_pass import DecomposeGluPass # noqa from .decompose_grouped_conv import DecomposeGroupedConv # noqa from .decompose_groupnorm_pass import DecomposeGroupNormPass # noqa +from .decompose_int16_activation_conv2d_pass import ( # noqa + DecomposeConv2dWithInt16ActivationPass, +) from .decompose_layernorm_pass import DecomposeLayerNormPass # noqa from .decompose_leaky_relu_pass import DecomposeLeakyReLUPass # noqa from .decompose_linalg_vector_norm_pass import DecomposeLinearVectorNormPass # noqa @@ -78,7 +82,7 @@ from .insert_int32_casts_after_int64_placeholders import ( # noqa InsertInt32CastsAfterInt64PlaceholdersPass, ) -from .insert_rescales_pass import InsertRescalePass # noqa +from .insert_rescales_pass import InsertRescaleInt32Pass, InsertRescalePass # noqa from .insert_table_ops import InsertTableOpsPass # noqa from .match_arg_dtype_pass import MatchArgDtypePass # noqa from .match_arg_ranks_pass import MatchArgRanksPass # noqa @@ -88,6 +92,8 @@ ReplaceScalarWithTensorArgPassTOSABI, ReplaceScalarWithTensorArgPassTOSAMI, ) +from .rewrite_matmul import RewriteMatmulPass # noqa +from .rewrite_upsample import RewriteUpsamplePass # noqa from .scalars_to_attribute_pass import ScalarsToAttributePass # noqa from .size_adjust_input_pass import SizeAdjustInputPass # noqa from .to_tosa_memory_format_pass import ToTosaMemoryFormatPass # noqa diff --git a/backends/arm/_passes/_debug_passes.py b/backends/arm/_passes/_debug_passes.py index 7809885d465..4c1661e50a9 100644 --- a/backends/arm/_passes/_debug_passes.py +++ b/backends/arm/_passes/_debug_passes.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+from typing import Set, Type + import torch from executorch.devtools.visualization.visualization_utils import visualize_graph from executorch.exir import ExportedProgram @@ -14,6 +16,8 @@ class VisualizePass(ExportPass): This pass visualizes the graph at the point of insertion in the pass manager """ + _passes_required_after: Set[Type[ExportPass]] = set() + def __init__(self, exported_program: ExportedProgram) -> None: super().__init__() self.exported_program = exported_program diff --git a/backends/arm/_passes/add_bias_pass.py b/backends/arm/_passes/add_bias_pass.py index 31c0c0505cb..fd5476f51b8 100644 --- a/backends/arm/_passes/add_bias_pass.py +++ b/backends/arm/_passes/add_bias_pass.py @@ -3,13 +3,16 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from typing import Set, Type + import torch from executorch.backends.arm._passes import ArmPass from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor +from executorch.backends.arm.tosa.mapping import TosaSpecialDtype from executorch.backends.transforms.utils import create_constant_placeholder from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import PassResult +from executorch.exir.pass_base import ExportPass, PassResult from torch.export.graph_signature import InputKind @@ -19,6 +22,8 @@ class AddBiasPass(ArmPass): The bias is set to zero. 
""" + _passes_required_after: Set[Type[ExportPass]] = set() + targeted_ops = (exir_ops.edge.aten.convolution.default,) def call(self, graph_module): @@ -55,6 +60,10 @@ def call(self, graph_module): persistent_buffer=True, name=f"{node.name}_bias", ) + if node.args[0].meta["val"].dtype == torch.int16: + bias_node.meta[TosaSpecialDtype.meta_key()] = ( + TosaSpecialDtype.INT48 + ) node.update_arg(2, bias_node) if modified: diff --git a/backends/arm/_passes/annotate_decomposed_matmul.py b/backends/arm/_passes/annotate_decomposed_matmul.py index 8156ca0b89d..72ae46c76c1 100644 --- a/backends/arm/_passes/annotate_decomposed_matmul.py +++ b/backends/arm/_passes/annotate_decomposed_matmul.py @@ -7,10 +7,13 @@ import itertools import operator -from typing import cast, List +from typing import cast, List, Set, Type import torch from executorch.backends.arm._passes.arm_pass_utils import create_node +from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( + FoldAndAnnotateQParamsPass, +) from executorch.backends.arm.constants import DQ_OPS, Q_OPS from executorch.exir.dialects._ops import ops as exir_ops @@ -29,6 +32,8 @@ class AnnotateDecomposedMatmulPass(ExportPass): matmul-op (can be mm or bmm). 
""" + _passes_required_after: Set[Type[ExportPass]] = {FoldAndAnnotateQParamsPass} + def _match_partition_to_node( self, node: torch.fx.Node, partitioned_inputs: List[torch.fx.Node] ) -> torch.fx.Node: @@ -68,7 +73,10 @@ def call(self, graph_module: GraphModule) -> PassResult: node for node in partition.nodes if node.target in matmul_targets ][0] - if quantized_input: + if quantized_input and not all( + input_node.target in DQ_OPS + for input_node in matmul_node.all_input_nodes + ): matmul_args = matmul_node.all_input_nodes for node in matmul_args: # Find the dq-node connected to this mm/bmm arg @@ -94,7 +102,9 @@ def call(self, graph_module: GraphModule) -> PassResult: partition_output = list(partition.output_nodes[0].users)[0] quantized_output = partition_output.target in Q_OPS - if quantized_output: + if quantized_output and not all( + user.target in Q_OPS for user in matmul_node.users + ): with graph_module.graph.inserting_after(matmul_node): # Create q-node after matmul q_node = create_node( diff --git a/backends/arm/_passes/annotate_output_dim_order_pass.py b/backends/arm/_passes/annotate_output_dim_order_pass.py index 08f93383a9c..8dc13326e4a 100644 --- a/backends/arm/_passes/annotate_output_dim_order_pass.py +++ b/backends/arm/_passes/annotate_output_dim_order_pass.py @@ -3,9 +3,12 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + +from typing import Set, Type + from executorch.backends.arm._passes import ArmPass from executorch.backends.arm._passes.arm_pass_utils import get_output_dim_orders -from executorch.exir.pass_base import PassResult +from executorch.exir.pass_base import ExportPass, PassResult class AnnotateOutputDimOrderPass(ArmPass): @@ -14,6 +17,8 @@ class AnnotateOutputDimOrderPass(ArmPass): for verifying that the dim order does not change unexpectedly in later passes. 
""" + _passes_required_after: Set[Type[ExportPass]] = set() + def call(self, graph_module): output_node = graph_module.graph.output_node() output_node.meta["original_dim_orders"] = get_output_dim_orders(graph_module) diff --git a/backends/arm/_passes/arm_pass.py b/backends/arm/_passes/arm_pass.py index 085267a174e..c76b5d157a7 100644 --- a/backends/arm/_passes/arm_pass.py +++ b/backends/arm/_passes/arm_pass.py @@ -6,7 +6,8 @@ # pyre-unsafe import traceback -from typing import Optional +from abc import abstractmethod +from typing import List, Optional, Set, Type import torch from executorch.exir.pass_base import ExportPass, NodeMetadata @@ -19,6 +20,36 @@ def __init__(self, exported_program: Optional[torch.export.ExportedProgram] = No super(ArmPass, self).__init__() self.exported_program = exported_program + @property + @abstractmethod + def _passes_required_after(self) -> Set[Type[ExportPass]]: + """The subclass defines passes that must run after it""" + pass + + @staticmethod + def get_required_passes(pass_) -> List[str]: + """ + Returns the list of passes that must be run after this pass, sorted by name. + """ + if hasattr(pass_, "_passes_required_after"): + return sorted([ArmPass.get_name(p) for p in pass_._passes_required_after]) + else: + return [] + + @staticmethod + def get_name(pass_) -> str: + """ + Returns the name of the pass. + """ + if isinstance(pass_, ExportPass): + return pass_.__class__.__name__ + elif hasattr(pass_, "__name__"): + return pass_.__name__ + else: + raise ValueError( + f"Cannot get name for pass: {pass_}. It must be an instance of ExportPass or have a __name__ attribute." 
+ ) + def call_operator(self, op, args, kwargs, meta, updated: Optional[bool] = False): if not updated: return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py index f49206da67e..325f667f0ac 100644 --- a/backends/arm/_passes/arm_pass_manager.py +++ b/backends/arm/_passes/arm_pass_manager.py @@ -7,6 +7,9 @@ # pyre-unsafe + +from collections import defaultdict + import executorch.backends.arm.tosa.dialect # noqa: unused from executorch.backends.arm._passes import ( AddBiasPass, @@ -33,12 +36,14 @@ DecomposeAcoshPass, DecomposeAdaptiveAvgPool2dPass, DecomposeAddmmPass, + DecomposeAddSubAlphaPass, DecomposeAsinAndAcosPass, DecomposeAsinhPass, DecomposeAtanhPass, DecomposeAtanPass, DecomposeAvgPool2d, DecomposeBatchNormNoStatsPass, + DecomposeConv2dWithInt16ActivationPass, DecomposeCoshPass, DecomposeCosineSimilarityPass, DecomposeCumsumPass, @@ -77,6 +82,7 @@ FuseEqualPlaceholdersPass, FuseQuantizedActivationPass, InsertInt32CastsAfterInt64PlaceholdersPass, + InsertRescaleInt32Pass, InsertRescalePass, InsertTableOpsPass, MatchArgDtypePass, @@ -87,6 +93,8 @@ ReplaceScalarWithTensorArgPassTOSABI, ReplaceScalarWithTensorArgPassTOSAMI, RetraceFoldedDtypesPass, + RewriteMatmulPass, + RewriteUpsamplePass, ScalarsToAttributePass, SizeAdjustInputPass, ToTosaMemoryFormatPass, @@ -94,6 +102,7 @@ UnsqueezeScalarPlaceholdersPass, ) +from executorch.backends.arm._passes.arm_pass import ArmPass from executorch.backends.arm.tosa.specification import ( TosaLoweringContext, TosaSpecification, @@ -107,6 +116,8 @@ from executorch.exir.pass_manager import PassManager from executorch.exir.passes.remove_graph_asserts_pass import RemoveGraphAssertsPass from torch.fx import GraphModule +from torch.fx.passes.infra.pass_base import PassResult +from torch.nn.modules import Module class ArmPassManager(PassManager): @@ -115,6 +126,32 @@ def __init__(self, tosa_spec: TosaSpecification) -> None: 
self.tosa_spec = tosa_spec super().__init__() + def validate_constraints_mandatory(self): + """ + Validates that necessary passes have run before transforming to backend. + + Note that this differs from the original validate_constraints function, which + only checks the order of passes. + """ + passes_to_run = defaultdict(list) + + for current_pass in self.passes: + current_pass_name = ArmPass.get_name(current_pass) + for required_pass_name in ArmPass.get_required_passes(current_pass): + passes_to_run[required_pass_name].append(current_pass_name) + + passes_to_run.pop(current_pass_name, None) + + if len(passes_to_run) > 0: + error_msg = "The following constraints for passes are not met:\n" + for required_pass, requiring_passes in passes_to_run.items(): + for requiring_pass in requiring_passes: + error_msg += ( + f" - {required_pass} must run after {requiring_pass}\n" + ) + + raise RuntimeError(error_msg) + def _transform(self, graph_module: GraphModule): with TosaLoweringContext(self.tosa_spec): return self(graph_module).graph_module @@ -125,7 +162,6 @@ def _tosa_INT_pipeline(self, exported_program: ExportedProgram) -> GraphModule: self.add_pass(RemoveGetItemPass()) self.add_pass(ConvertSplitToSlicePass()) self.add_pass(ConvertMmToBmmPass()) - self.add_pass(DecomposeLinearVectorNormPass()) self.add_pass( DecomposeMeanDimPass(exported_program.graph_module, self.tosa_spec) ) @@ -154,6 +190,7 @@ def _tosa_INT_pipeline(self, exported_program: ExportedProgram) -> GraphModule: self.add_pass(ComputeConstantOpsAOT(exported_program)) self.add_pass(DecomposeGroupedConv()) + self.add_pass(ConvertExpandCopyToRepeatPass()) self.add_pass(UnsqueezeBeforeRepeatPass()) self.add_pass(CastInt64BuffersToInt32Pass(exported_program)) @@ -167,14 +204,23 @@ def _tosa_INT_pipeline(self, exported_program: ExportedProgram) -> GraphModule: self.add_pass(FuseViewCopyTransform()) self.add_pass(FuseConstantArgsPass(exported_program)) + self.add_pass(InsertTableOpsPass(exported_program)) + # If 
we have a conv2d with int16 activation split up into a convolution + # and an addition, to work-around the lack of support for int48 in torch + # needs to happen before AddBiasPass, but after the table ops are inserted + # to be able to validate that conv2d has right dtype arguments. + self.add_pass(DecomposeConv2dWithInt16ActivationPass()) + self.add_pass(RewriteUpsamplePass(exported_program)) self.add_pass(AddBiasPass(exported_program)) - self.add_pass(InsertTableOpsPass(exported_program)) + self.add_pass(RewriteMatmulPass(exported_program)) self.add_pass(FuseEqualPlaceholdersPass(exported_program)) self.add_pass(ToTosaMemoryFormatPass(exported_program)) self.add_pass(RemoveNoopPass()) self.add_pass(InsertRescalePass()) + self.add_pass(InsertRescaleInt32Pass()) + self.validate_constraints_mandatory() return self._transform(exported_program.graph_module) def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule: @@ -217,6 +263,7 @@ def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule: ) self.add_pass(DecomposeNotEqualPass()) self.add_pass(DecomposeDivPass()) + self.add_pass(DecomposeAddSubAlphaPass()) self.add_pass(DecomposeSoftmaxPass()) self.add_pass(DecomposeGeluPass()) self.add_pass(ConvertFullLikeToFullPass()) @@ -251,13 +298,16 @@ def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule: self.add_pass(FuseViewCopyTransform()) self.add_pass(FuseConstantArgsPass(exported_program)) self.add_pass(CastInt64BuffersToInt32Pass(exported_program)) + self.add_pass(RewriteUpsamplePass(exported_program)) self.add_pass(AddBiasPass(exported_program)) self.add_pass(InsertTableOpsPass(exported_program)) + self.add_pass(RewriteMatmulPass(exported_program)) self.add_pass(FuseEqualPlaceholdersPass(exported_program)) self.add_pass(ToTosaMemoryFormatPass(exported_program)) self.add_pass(RemoveNoopPass()) self.add_pass(InsertRescalePass()) + self.validate_constraints_mandatory() return 
self._transform(exported_program.graph_module) def transform_to_backend_pipeline(self, exported_program: ExportedProgram): @@ -286,6 +336,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule): self.add_pass(DecomposeSignPass()) self.add_pass(DecomposeAddmmPass()) self.add_pass(DecomposeDivTensorModePass()) + self.add_pass(DecomposeAddSubAlphaPass()) self.add_pass(ReplaceScalarWithTensorArgPassTOSABI()) self.add_pass(ScalarsToAttributePass()) self.add_pass(DecomposeGroupNormPass()) @@ -317,3 +368,20 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule): self.add_pass(DecomposeMaskedFill()) return self._transform(graph_module) + + def __call__(self, module: Module) -> PassResult: + try: + return super().__call__(module) + except Exception as e: + first_exception = e.__cause__ or e.__context__ or e + import re + + message = e.args[0] + m = re.search(r"An error occurred when running the '([^']+)' pass", message) + if m: + pass_name = m.group(1) + first_exception.args = ( + f"{pass_name}: {first_exception.args[0]}", + *first_exception.args[1:], + ) + raise first_exception diff --git a/backends/arm/_passes/broadcast_args_pass.py b/backends/arm/_passes/broadcast_args_pass.py index f125ba13ff4..659e6aca686 100644 --- a/backends/arm/_passes/broadcast_args_pass.py +++ b/backends/arm/_passes/broadcast_args_pass.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from typing import Set, Type + from executorch.backends.arm._passes import ArmPass from executorch.backends.arm._passes.arm_pass_utils import ( @@ -12,7 +14,7 @@ from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import PassResult +from executorch.exir.pass_base import ExportPass, PassResult from torch.fx import GraphModule, Node @@ -22,6 +24,8 @@ class BroadcastArgsPass(ArmPass): This is done when more than one arg needs broadcasting. 
""" + _passes_required_after: Set[Type[ExportPass]] = set() + targeted_ops = { exir_ops.edge.aten.add.Tensor, exir_ops.edge.aten.sub.Tensor, diff --git a/backends/arm/_passes/cast_bool_to_int8_pass.py b/backends/arm/_passes/cast_bool_to_int8_pass.py index 1352671b01e..771b6d9e174 100644 --- a/backends/arm/_passes/cast_bool_to_int8_pass.py +++ b/backends/arm/_passes/cast_bool_to_int8_pass.py @@ -6,6 +6,8 @@ # The TOSA BITWISE_AND, BITWISE_OR, and BITWISE_XOR don't handle bool as input # If input/output is bool lest add a cast/conversion pass before/after to/from int8. +from typing import Set, Type + import torch from executorch.exir.dialects._ops import ops as exir_ops @@ -15,6 +17,8 @@ class CastBoolToInt8Pass(ExportPass): """Casts the input to int8 if it is not already and casts back the output to the original input dtype.""" + _passes_required_after: Set[Type[ExportPass]] = set() + targeted_ops = { exir_ops.edge.aten.bitwise_and.Tensor, exir_ops.edge.aten.bitwise_or.Tensor, diff --git a/backends/arm/_passes/cast_int64_pass.py b/backends/arm/_passes/cast_int64_pass.py index 8052c8fd2ce..d7b2a6b6b43 100644 --- a/backends/arm/_passes/cast_int64_pass.py +++ b/backends/arm/_passes/cast_int64_pass.py @@ -6,6 +6,7 @@ # pyre-unsafe import logging +from typing import Set, Type import torch from executorch.exir.pass_base import ExportPass, PassResult @@ -19,6 +20,8 @@ class CastInt64BuffersToInt32Pass(ExportPass): Cast int64 buffers to int32 if the int64 data is in int32 range. 
""" + _passes_required_after: Set[Type[ExportPass]] = set() + def __init__(self, exported_program: torch.export.ExportedProgram): super(CastInt64BuffersToInt32Pass, self).__init__() self.exported_program = exported_program diff --git a/backends/arm/_passes/cast_to_int32_pass.py b/backends/arm/_passes/cast_to_int32_pass.py index c4b009e2b88..2e574568235 100644 --- a/backends/arm/_passes/cast_to_int32_pass.py +++ b/backends/arm/_passes/cast_to_int32_pass.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from typing import Set, Type + import torch from executorch.exir.dialects._ops import ops as exir_ops @@ -12,6 +14,8 @@ class CastToInt32Pass(ExportPass): """Casts the input to int32 if it is not already and casts back the output to the original input dtype.""" + _passes_required_after: Set[Type[ExportPass]] = set() + targeted_ops = { exir_ops.edge.aten.bitwise_left_shift.Tensor, exir_ops.edge.aten.bitwise_right_shift.Tensor, diff --git a/backends/arm/_passes/conv1d_unsqueeze_pass.py b/backends/arm/_passes/conv1d_unsqueeze_pass.py index 56f674e9066..b228da6766f 100644 --- a/backends/arm/_passes/conv1d_unsqueeze_pass.py +++ b/backends/arm/_passes/conv1d_unsqueeze_pass.py @@ -6,6 +6,11 @@ # LICENSE file in the root directory of this source tree. +from typing import Set, Type + +from executorch.backends.arm._passes.add_bias_pass import AddBiasPass +from executorch.backends.arm._passes.size_adjust_input_pass import SizeAdjustInputPass + from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -21,6 +26,8 @@ class Conv1dUnsqueezePass(ExportPass): 3) squeeze the output back down to 3d. 
""" + _passes_required_after: Set[Type[ExportPass]] = {AddBiasPass, SizeAdjustInputPass} + def call_operator(self, op, args, kwargs, meta): if op != exir_ops.edge.aten.convolution.default: return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/convert_any_default_dim_dims_pass.py b/backends/arm/_passes/convert_any_default_dim_dims_pass.py index 7085f17add0..8c8e5086b6d 100644 --- a/backends/arm/_passes/convert_any_default_dim_dims_pass.py +++ b/backends/arm/_passes/convert_any_default_dim_dims_pass.py @@ -3,7 +3,12 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from typing import Set, Type + import torch +from executorch.backends.arm._passes.convert_squeezes_to_view import ( + ConvertSqueezesToViewPass, +) from executorch.exir.dialects._ops import ( # type: ignore[import-not-found] ops as exir_ops, ) @@ -44,6 +49,8 @@ class ConvertAnyDefaultDimDimsPass(ExportPass): squeeze(dim = [dim1, dim2]) """ + _passes_required_after: Set[Type[ExportPass]] = {ConvertSqueezesToViewPass} + def call(self, graph_module: torch.fx.GraphModule): modified = False for node in graph_module.graph.nodes: diff --git a/backends/arm/_passes/convert_expand_copy_to_repeat.py b/backends/arm/_passes/convert_expand_copy_to_repeat.py index ee509c7ebb5..83b47d31755 100644 --- a/backends/arm/_passes/convert_expand_copy_to_repeat.py +++ b/backends/arm/_passes/convert_expand_copy_to_repeat.py @@ -6,10 +6,13 @@ # pyre-unsafe import logging -from typing import cast +from typing import cast, Set, Type import torch +from executorch.backends.arm._passes.unsqueeze_before_repeat_pass import ( + UnsqueezeBeforeRepeatPass, +) from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -50,6 +53,8 @@ class ConvertExpandCopyToRepeatPass(ExportPass): Replace expand copy with repeat since it is a repeat that can only repeat singleton dimensions. 
""" + _passes_required_after: Set[Type[ExportPass]] = {UnsqueezeBeforeRepeatPass} + expand_copy = exir_ops.edge.aten.expand_copy.default repeat = exir_ops.edge.aten.repeat.default diff --git a/backends/arm/_passes/convert_full_like_to_full_pass.py b/backends/arm/_passes/convert_full_like_to_full_pass.py index 234e2ecda82..06822a4abcf 100644 --- a/backends/arm/_passes/convert_full_like_to_full_pass.py +++ b/backends/arm/_passes/convert_full_like_to_full_pass.py @@ -3,11 +3,16 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from typing import Set, Type + +from executorch.backends.arm._passes.arm_pass import ArmPass +from executorch.backends.arm._passes.fuse_constant_ops_pass import ComputeConstantOpsAOT + from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass -class ConvertFullLikeToFullPass(ExportPass): +class ConvertFullLikeToFullPass(ArmPass): """As per the full_like pytorch documentation, `torch.full_like(input, fill_value)` is equivalent to `torch.full(input.size(), @@ -19,6 +24,8 @@ class ConvertFullLikeToFullPass(ExportPass): Skip layout and device since it's not relevant for our backend. """ + _passes_required_after: Set[Type[ExportPass]] = {ComputeConstantOpsAOT} + def call_operator(self, op, args, kwargs, meta): if op not in [ exir_ops.edge.aten.full_like.default, diff --git a/backends/arm/_passes/convert_int64_const_ops_to_int32.py b/backends/arm/_passes/convert_int64_const_ops_to_int32.py index 704c89dbd78..2bf305a13f6 100644 --- a/backends/arm/_passes/convert_int64_const_ops_to_int32.py +++ b/backends/arm/_passes/convert_int64_const_ops_to_int32.py @@ -7,6 +7,7 @@ import logging +from typing import Set, Type import torch from executorch.backends.arm._passes.fuse_constant_ops_pass import ComputeConstantOpsAOT @@ -30,6 +31,8 @@ class ConvertInt64ConstOpsToInt32Pass(ExportPass): 5. 
`torch.tensor` """ + _passes_required_after: Set[Type[ExportPass]] = {ComputeConstantOpsAOT} + torch_ops = [ torch.ops.aten.full.default, torch.ops.aten.arange.default, diff --git a/backends/arm/_passes/convert_int64_output_ops_to_int32.py b/backends/arm/_passes/convert_int64_output_ops_to_int32.py index 788201be6c8..d0d29d14e30 100644 --- a/backends/arm/_passes/convert_int64_output_ops_to_int32.py +++ b/backends/arm/_passes/convert_int64_output_ops_to_int32.py @@ -7,6 +7,7 @@ import logging +from typing import Set, Type import torch from executorch.backends.arm._passes.arm_pass_utils import ( @@ -44,6 +45,8 @@ class ConvertInt64OutputOpsToInt32Pass(ExportPass): the int32 range. """ + _passes_required_after: Set[Type[ExportPass]] = set() + aten_cast_ops = ( torch.ops.aten.to.dtype, torch.ops.aten.to.dtype_layout, diff --git a/backends/arm/_passes/convert_int_pow_to_mul.py b/backends/arm/_passes/convert_int_pow_to_mul.py index f22a2fd0b3c..8f9b3a9cb4b 100644 --- a/backends/arm/_passes/convert_int_pow_to_mul.py +++ b/backends/arm/_passes/convert_int_pow_to_mul.py @@ -5,8 +5,11 @@ # pyre-unsafe +from typing import Set, Type + from executorch.backends.arm._passes import ArmPass from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass class ConvertIntPowToMuls(ArmPass): @@ -16,6 +19,8 @@ class ConvertIntPowToMuls(ArmPass): Needs to be run before doing scalar to tensor conversion. 
""" + _passes_required_after: Set[Type[ExportPass]] = set() + def call_operator(self, op, args, kwargs, meta): if op != exir_ops.edge.aten.pow.Tensor_Scalar: return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/convert_minmax_pass.py b/backends/arm/_passes/convert_minmax_pass.py index 9f409632c20..79bb6e2db0c 100644 --- a/backends/arm/_passes/convert_minmax_pass.py +++ b/backends/arm/_passes/convert_minmax_pass.py @@ -3,7 +3,13 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from typing import cast, Set, Type + import torch +from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor +from executorch.backends.arm._passes.convert_squeezes_to_view import ( + ConvertSqueezesToViewPass, +) from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult @@ -29,6 +35,8 @@ class ConvertMinMaxPass(ExportPass): squeeze(dim = [dim1, dim2]) """ + _passes_required_after: Set[Type[ExportPass]] = {ConvertSqueezesToViewPass} + def check_argmax(self, node): """ Raises a RuntimeError if the argmax value returned by the min/max op is used in the graph. @@ -94,20 +102,28 @@ def call(self, graph_module: torch.fx.GraphModule): replace_node, op, squeeze_op = self.get_variables(node) # Unwrap args - if len(node.args) == 2: + if len(node.args) == 1: + # If dims is unspecified, min/max over all dims. 
+ input_node = cast(torch.fx.Node, node.args[0]) + input_shape = get_first_fake_tensor(input_node).shape + dims = range(len(input_shape)) + keepdims = False + elif len(node.args) == 2: input_node, dims = node.args keepdims = False elif len(node.args) == 3: input_node, dims, keepdims = node.args else: - raise RuntimeError(f"Unexpected arg size in {node.name}") + raise RuntimeError( + f"Unexpected arg size {len(node.args)} in {node.name}" + ) try: - iter(dims) - except: - dims = [dims] + iter(dims) # type:ignore[assignment] + except Exception: + dims = [dims] # type:ignore[assignment] else: - dims = list(dims) + dims = list(dims) # type:ignore[assignment] # Unroll multi-dimensional reduction and keep-dims arg with graph_module.graph.inserting_before(node): diff --git a/backends/arm/_passes/convert_split_to_slice.py b/backends/arm/_passes/convert_split_to_slice.py index 67bd9d73e81..7578c07ca53 100644 --- a/backends/arm/_passes/convert_split_to_slice.py +++ b/backends/arm/_passes/convert_split_to_slice.py @@ -5,6 +5,8 @@ # pyre-unsafe +from typing import Set, Type + import torch.fx from executorch.backends.arm._passes.arm_pass_utils import ( create_node, @@ -19,6 +21,8 @@ class ConvertSplitToSlicePass(ExportPass): Replace a split operation with many slice operations. 
""" + _passes_required_after: Set[Type[ExportPass]] = set() + split_ops = ( exir_ops.edge.aten.split_with_sizes_copy.default, exir_ops.edge.aten.split_copy.Tensor, diff --git a/backends/arm/_passes/convert_squeezes_to_view.py b/backends/arm/_passes/convert_squeezes_to_view.py index 889dbe74172..70f4625f0ff 100644 --- a/backends/arm/_passes/convert_squeezes_to_view.py +++ b/backends/arm/_passes/convert_squeezes_to_view.py @@ -6,6 +6,10 @@ # pyre-unsafe +from typing import Set, Type + +from executorch.backends.transforms.fuse_view_copy import FuseViewCopyTransform + from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -15,6 +19,8 @@ class ConvertSqueezesToViewPass(ExportPass): Replaces squeeze/unsqueeze operators with view. These are simply special cases of the view op, so removing them gives us less cases to handle in the node visitiors. """ + _passes_required_after: Set[Type[ExportPass]] = {FuseViewCopyTransform} + def call_operator(self, op, args, kwargs, meta): if op not in [ exir_ops.edge.aten.squeeze_copy.dims, diff --git a/backends/arm/_passes/convert_to_clamp.py b/backends/arm/_passes/convert_to_clamp.py index 8f2c9b16f9a..0199d6798bc 100644 --- a/backends/arm/_passes/convert_to_clamp.py +++ b/backends/arm/_passes/convert_to_clamp.py @@ -3,7 +3,11 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-from typing import Tuple +from typing import Set, Tuple, Type + +from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( + QuantizeOperatorArguments, +) from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -24,6 +28,8 @@ def get_clamp_params(op, args) -> Tuple[float | None, float | None]: class ConvertToClampPass(ExportPass): + _passes_required_after: Set[Type[ExportPass]] = {QuantizeOperatorArguments} + def call_operator(self, op, args, kwargs, meta): if op not in edge_operators: return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_acosh_pass.py b/backends/arm/_passes/decompose_acosh_pass.py index 1d92dd68c4a..509849fce4e 100644 --- a/backends/arm/_passes/decompose_acosh_pass.py +++ b/backends/arm/_passes/decompose_acosh_pass.py @@ -5,8 +5,18 @@ # pyre-unsafe +from typing import Set, Type + from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.decompose_sqrt_pass import DecomposeSqrtPass +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass # noqa +from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass +from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass +from executorch.backends.arm._passes.replace_scalar_with_tensor_pass import ( + ReplaceScalarWithTensorArgPassTOSAMI, +) from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass # For MI case edge_acosh_op = exir_ops.edge.aten.acosh.default @@ -19,6 +29,14 @@ class DecomposeAcoshPass(ArmPass): acosh(x) = log(x + sqrt((x-1)(x+1)) """ + _passes_required_after: Set[Type[ExportPass]] = { + DecomposeSqrtPass, + InsertTableOpsPass, + MatchArgRanksPass, + ReplaceScalarWithTensorArgPassTOSAMI, + MatchArgDtypePass, + } + def call_operator(self, op, args, kwargs, meta, updated=False): if op is not edge_acosh_op: diff --git 
a/backends/arm/_passes/decompose_adaptive_avg_pool2d_pass.py b/backends/arm/_passes/decompose_adaptive_avg_pool2d_pass.py index abfcc8e3945..52ddb77151d 100644 --- a/backends/arm/_passes/decompose_adaptive_avg_pool2d_pass.py +++ b/backends/arm/_passes/decompose_adaptive_avg_pool2d_pass.py @@ -4,12 +4,15 @@ # LICENSE file in the root directory of this source tree. from math import ceil, floor +from typing import Set, Type import torch from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.decompose_avg_pool2d import DecomposeAvgPool2d from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass edge_ops = (exir_ops.edge.aten._adaptive_avg_pool2d.default,) aten_ops = (torch.ops.aten.adaptive_avg_pool2d.default,) @@ -41,6 +44,8 @@ class DecomposeAdaptiveAvgPool2dPass(ArmPass): The output is of size output_size_h x output_size_w for any input. """ + _passes_required_after: Set[Type[ExportPass]] = {DecomposeAvgPool2d} + def call_operator(self, op, args, kwargs, meta, updated=False): if op not in (edge_ops + aten_ops): return super().call_operator(op, args, kwargs, meta, updated) diff --git a/backends/arm/_passes/decompose_add_sub_alpha_pass.py b/backends/arm/_passes/decompose_add_sub_alpha_pass.py new file mode 100644 index 00000000000..c0ed1bae09b --- /dev/null +++ b/backends/arm/_passes/decompose_add_sub_alpha_pass.py @@ -0,0 +1,94 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from __future__ import annotations + +import numbers +from typing import Set, Type + +import torch +from executorch.backends.arm._passes import ArmPass +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass + + +_ADD_OPS = ( + exir_ops.edge.aten.add.Tensor, + torch.ops.aten.add.Tensor, +) + +_SUB_OPS = ( + exir_ops.edge.aten.sub.Tensor, + torch.ops.aten.sub.Tensor, +) + + +def _get_ops(op): + if op in _ADD_OPS: + if op is exir_ops.edge.aten.add.Tensor: + return ( + exir_ops.edge.aten.mul.Tensor, + exir_ops.edge.aten.full.default, + exir_ops.edge.aten.add.Tensor, + ) + return ( + torch.ops.aten.mul.Tensor, + torch.ops.aten.full.default, + torch.ops.aten.add.Tensor, + ) + if op in _SUB_OPS: + if op is exir_ops.edge.aten.sub.Tensor: + return ( + exir_ops.edge.aten.mul.Tensor, + exir_ops.edge.aten.full.default, + exir_ops.edge.aten.sub.Tensor, + ) + return ( + torch.ops.aten.mul.Tensor, + torch.ops.aten.full.default, + torch.ops.aten.sub.Tensor, + ) + raise RuntimeError(f"Unsupported operator {op}") + + +def _should_decompose(alpha) -> bool: + if isinstance(alpha, numbers.Number): + return alpha != 1 + return False + + +class DecomposeAddSubAlphaPass(ArmPass): + """Rewrite add/sub with alpha into a mul followed by add/sub.""" + + _passes_required_after: Set[Type[ExportPass]] = set() + + def call_operator(self, op, args, kwargs, meta, updated: bool | None = False): + if op not in _ADD_OPS + _SUB_OPS: + return super().call_operator(op, args, kwargs, meta, updated) + + alpha = kwargs.get("alpha", 1) + if not _should_decompose(alpha): + return super().call_operator(op, args, kwargs, meta, updated) + + mul_op, full_op, binary_op = _get_ops(op) + lhs, rhs = args + + alpha_full = super().call_operator( + full_op, ((1,), float(alpha)), {}, meta, updated=True + ) + scaled_rhs = super().call_operator( + mul_op, + (rhs, alpha_full), + {}, + meta, + updated=True, + ) + return super().call_operator( + binary_op, + (lhs, 
scaled_rhs), + {}, + meta, + updated=True, + ) diff --git a/backends/arm/_passes/decompose_addmm_pass.py b/backends/arm/_passes/decompose_addmm_pass.py index b59a8cb02d3..a95c1cc7fec 100644 --- a/backends/arm/_passes/decompose_addmm_pass.py +++ b/backends/arm/_passes/decompose_addmm_pass.py @@ -3,10 +3,16 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from typing import Set, Type + import torch from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass +from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass +from executorch.backends.arm._passes.mm_to_bmm_pass import ConvertMmToBmmPass # noqa from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass # For MI case @@ -36,6 +42,12 @@ def get_ops(op): class DecomposeAddmmPass(ArmPass): """Decomposes the addmm operator into tensor multiplication and addition.""" + _passes_required_after: Set[Type[ExportPass]] = { + ConvertMmToBmmPass, + MatchArgRanksPass, + MatchArgDtypePass, + } + def call_operator(self, op, args, kwargs, meta): if op not in [edge_addmm, aten_addmm]: return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_asin_and_acos_pass.py b/backends/arm/_passes/decompose_asin_and_acos_pass.py index e067f17b0ca..5b1c575e9c9 100644 --- a/backends/arm/_passes/decompose_asin_and_acos_pass.py +++ b/backends/arm/_passes/decompose_asin_and_acos_pass.py @@ -7,11 +7,23 @@ import logging from math import pi +from typing import Set, Type import torch from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.convert_full_like_to_full_pass import ( + ConvertFullLikeToFullPass, +) +from executorch.backends.arm._passes.decompose_div_pass import DecomposeDivPass +from executorch.backends.arm._passes.decompose_sqrt_pass 
import DecomposeSqrtPass +from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass +from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass +from executorch.backends.arm._passes.replace_scalar_with_tensor_pass import ( + ReplaceScalarWithTensorArgPassTOSAMI, +) from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass # For MI case edge_asin_op = (exir_ops.edge.aten.asin.default,) @@ -54,6 +66,15 @@ class DecomposeAsinAndAcosPass(ArmPass): """ + _passes_required_after: Set[Type[ExportPass]] = { + DecomposeSqrtPass, + DecomposeDivPass, + ConvertFullLikeToFullPass, + MatchArgRanksPass, + MatchArgDtypePass, + ReplaceScalarWithTensorArgPassTOSAMI, + } + def _build_polynomial( self, coefficients: list[float], variable: torch.Tensor, meta: dict[str, str] ) -> torch.Tensor: diff --git a/backends/arm/_passes/decompose_asinh_pass.py b/backends/arm/_passes/decompose_asinh_pass.py index a0b78c51a77..088230ca4b2 100644 --- a/backends/arm/_passes/decompose_asinh_pass.py +++ b/backends/arm/_passes/decompose_asinh_pass.py @@ -6,8 +6,18 @@ # pyre-unsafe +from typing import Set, Type + from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.decompose_sqrt_pass import DecomposeSqrtPass +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass +from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass +from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass +from executorch.backends.arm._passes.replace_scalar_with_tensor_pass import ( + ReplaceScalarWithTensorArgPassTOSAMI, +) from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass # For MI case edge_asinh_op = (exir_ops.edge.aten.asinh.default,) @@ -20,6 +30,14 @@ class DecomposeAsinhPass(ArmPass): asinh(x) = log(x + sqrt(x^2 + 1)) """ + _passes_required_after: 
Set[Type[ExportPass]] = { + DecomposeSqrtPass, + InsertTableOpsPass, + MatchArgRanksPass, + ReplaceScalarWithTensorArgPassTOSAMI, + MatchArgDtypePass, + } + def call_operator(self, op, args, kwargs, meta): if op not in edge_asinh_op: return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_atan_pass.py b/backends/arm/_passes/decompose_atan_pass.py index 57b9dde5216..03ed62e7870 100644 --- a/backends/arm/_passes/decompose_atan_pass.py +++ b/backends/arm/_passes/decompose_atan_pass.py @@ -5,9 +5,17 @@ import logging from math import pi +from typing import Set, Type from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass +from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass +from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass +from executorch.backends.arm._passes.replace_scalar_with_tensor_pass import ( + ReplaceScalarWithTensorArgPassTOSAMI, +) from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass edge_atan = exir_ops.edge.aten.atan.default # MI case @@ -35,6 +43,13 @@ def _get_atan_ops(op): class DecomposeAtanPass(ArmPass): """Decomposes the atan operator into a rational (Padé) approximation.""" + _passes_required_after: Set[Type[ExportPass]] = { + InsertTableOpsPass, + MatchArgRanksPass, + MatchArgDtypePass, + ReplaceScalarWithTensorArgPassTOSAMI, + } + def _rational_approximation(self, z, ops, meta): """Creates a (2,1) Padé approximation for atan(x) on [-1, 1].""" diff --git a/backends/arm/_passes/decompose_atanh_pass.py b/backends/arm/_passes/decompose_atanh_pass.py index dfdad41e556..2c8347e7e9f 100644 --- a/backends/arm/_passes/decompose_atanh_pass.py +++ b/backends/arm/_passes/decompose_atanh_pass.py @@ -3,8 +3,17 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this 
source tree. +from typing import Set, Type + from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass +from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass +from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass +from executorch.backends.arm._passes.replace_scalar_with_tensor_pass import ( + ReplaceScalarWithTensorArgPassTOSAMI, +) from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass edge_atanh = exir_ops.edge.aten.atanh.default # MI case @@ -30,6 +39,13 @@ class DecomposeAtanhPass(ArmPass): atanh(x) = 0.5 * log((1 + x) / (1 - x)) """ + _passes_required_after: Set[Type[ExportPass]] = { + InsertTableOpsPass, + MatchArgRanksPass, + MatchArgDtypePass, + ReplaceScalarWithTensorArgPassTOSAMI, + } + def call_operator(self, op, args, kwargs, meta): if op is not edge_atanh: return super().call_operator(op, args, kwargs, meta, updated=False) diff --git a/backends/arm/_passes/decompose_avg_pool2d.py b/backends/arm/_passes/decompose_avg_pool2d.py index 21ed6b518c7..bbb8ceba129 100644 --- a/backends/arm/_passes/decompose_avg_pool2d.py +++ b/backends/arm/_passes/decompose_avg_pool2d.py @@ -4,7 +4,10 @@ # LICENSE file in the root directory of this source tree. 
+from typing import Set, Type + import torch +from executorch.backends.arm._passes.fuse_constant_ops_pass import ComputeConstantOpsAOT from executorch.backends.arm.operators.operator_validation_utils import ( adjust_pooling_pad_if_needed, ) @@ -30,11 +33,11 @@ def get_decomposition(op) -> tuple: torch.ops.aten.avg_pool2d.default, torch.ops.aten.mul.Tensor, ) - raise RuntimeError(f"Can't get div decomposition for op {op}") + raise RuntimeError(f"Can't get avg_pool2d decomposition for op {op}") class DecomposeAvgPool2d(ExportPass): - """ """ + _passes_required_after: Set[Type[ExportPass]] = {ComputeConstantOpsAOT} def call_operator(self, op, args, kwargs, meta): if op not in (edge_div_ops + aten_div_ops): diff --git a/backends/arm/_passes/decompose_batch_norm_no_stats.py b/backends/arm/_passes/decompose_batch_norm_no_stats.py index 5fdb8db2d7c..b18bd4d9ac8 100644 --- a/backends/arm/_passes/decompose_batch_norm_no_stats.py +++ b/backends/arm/_passes/decompose_batch_norm_no_stats.py @@ -6,12 +6,16 @@ # pyre-unsafe import operator +from typing import Set, Type import torch from executorch.backends.arm._passes import ArmPass from executorch.backends.arm._passes.arm_pass_utils import create_node +from executorch.backends.arm._passes.fuse_constant_ops_pass import ComputeConstantOpsAOT + +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import PassResult +from executorch.exir.pass_base import ExportPass, PassResult class DecomposeBatchNormNoStatsPass(ArmPass): @@ -33,6 +37,11 @@ class DecomposeBatchNormNoStatsPass(ArmPass): Source: https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm2d.html """ + _passes_required_after: Set[Type[ExportPass]] = { + ComputeConstantOpsAOT, + InsertTableOpsPass, + } + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: # noqa: C901 bn_ops = ( exir_ops.edge.aten._native_batch_norm_legit.no_stats, diff 
--git a/backends/arm/_passes/decompose_cosh_pass.py b/backends/arm/_passes/decompose_cosh_pass.py index a94cf9ecff0..cbfbd5783e2 100644 --- a/backends/arm/_passes/decompose_cosh_pass.py +++ b/backends/arm/_passes/decompose_cosh_pass.py @@ -3,8 +3,17 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from typing import Set, Type + from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass +from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass +from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass +from executorch.backends.arm._passes.replace_scalar_with_tensor_pass import ( + ReplaceScalarWithTensorArgPassTOSAMI, +) from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass # For MI case edge_cosh = exir_ops.edge.aten.cosh.default @@ -19,6 +28,13 @@ class DecomposeCoshPass(ArmPass): """ + _passes_required_after: Set[Type[ExportPass]] = { + InsertTableOpsPass, + MatchArgRanksPass, + ReplaceScalarWithTensorArgPassTOSAMI, + MatchArgDtypePass, + } + def call_operator(self, op, args, kwargs, meta, updated=False): if op is not edge_cosh: return super().call_operator(op, args, kwargs, meta, updated) diff --git a/backends/arm/_passes/decompose_cosine_similarity_pass.py b/backends/arm/_passes/decompose_cosine_similarity_pass.py index 9978e653408..965dad54697 100644 --- a/backends/arm/_passes/decompose_cosine_similarity_pass.py +++ b/backends/arm/_passes/decompose_cosine_similarity_pass.py @@ -3,7 +3,16 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+from typing import Set, Type + import torch +from executorch.backends.arm._passes.convert_full_like_to_full_pass import ( + ConvertFullLikeToFullPass, +) + +from executorch.backends.arm._passes.decompose_div_pass import DecomposeDivPass +from executorch.backends.arm._passes.decompose_sum_pass import DecomposeSumPass +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.exir.pass_base import ExportPass torch_cosine_similarity = (torch.ops.aten.cosine_similarity.default,) @@ -22,6 +31,13 @@ class DecomposeCosineSimilarityPass(ExportPass): out = div(dot, denom) """ + _passes_required_after: Set[Type[ExportPass]] = { + DecomposeDivPass, + DecomposeSumPass, + ConvertFullLikeToFullPass, + InsertTableOpsPass, + } + def call_operator(self, op, args, kwargs, meta): if op not in torch_cosine_similarity: return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_cumsum_pass.py b/backends/arm/_passes/decompose_cumsum_pass.py index 155ccd11594..32c59f6d793 100644 --- a/backends/arm/_passes/decompose_cumsum_pass.py +++ b/backends/arm/_passes/decompose_cumsum_pass.py @@ -4,15 +4,17 @@ # LICENSE file in the root directory of this source tree. from math import prod +from typing import Set, Type import torch from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.add_bias_pass import AddBiasPass from executorch.backends.arm._passes.arm_pass_utils import create_node from executorch.backends.arm._passes.quant_args import QuantArgs from executorch.backends.transforms.utils import create_constant_placeholder from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import PassResult +from executorch.exir.pass_base import ExportPass, PassResult from torch.export.graph_signature import InputKind @@ -39,6 +41,8 @@ class DecomposeCumsumPass(ArmPass): And the convolution is applied over dimension H. 
""" + _passes_required_after: Set[Type[ExportPass]] = {AddBiasPass} + def call(self, graph_module): graph = graph_module.graph targets = (exir_ops.edge.aten.cumsum.default, torch.ops.aten.cumsum.default) diff --git a/backends/arm/_passes/decompose_div_pass.py b/backends/arm/_passes/decompose_div_pass.py index 893531dac69..b6db103930e 100644 --- a/backends/arm/_passes/decompose_div_pass.py +++ b/backends/arm/_passes/decompose_div_pass.py @@ -1,4 +1,4 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -6,7 +6,10 @@ # pyre-unsafe +from typing import Set, Type + import torch +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -37,6 +40,8 @@ class DecomposeDivPass(ExportPass): y = mul(a,x) """ + _passes_required_after: Set[Type[ExportPass]] = {InsertTableOpsPass} + def call_operator(self, op, args, kwargs, meta): if op not in (edge_div_ops + aten_div_ops): return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_div_tensor_mode.py b/backends/arm/_passes/decompose_div_tensor_mode.py index 0e6b40afbb2..b5352475d51 100644 --- a/backends/arm/_passes/decompose_div_tensor_mode.py +++ b/backends/arm/_passes/decompose_div_tensor_mode.py @@ -5,7 +5,10 @@ # pyre-unsafe +from typing import Set, Type + import torch +from executorch.backends.arm._passes.decompose_div_pass import DecomposeDivPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -48,6 +51,8 @@ class DecomposeDivTensorModePass(ExportPass): rounding_mode='trunc' -> where(div(a,b) < 0, ceil(div(a,b)), floor(div(a,b))) """ + _passes_required_after: Set[Type[ExportPass]] = {DecomposeDivPass} + def call_operator(self, op, args, 
kwargs, meta): if op not in (edge_div_mode_ops + aten_div_mode_ops): return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_elu_pass.py b/backends/arm/_passes/decompose_elu_pass.py index 743f1b46f4d..ba3d32b7529 100644 --- a/backends/arm/_passes/decompose_elu_pass.py +++ b/backends/arm/_passes/decompose_elu_pass.py @@ -3,8 +3,11 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from typing import Set, Type + from executorch.backends.arm._passes import ArmPass from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass edge_elu_ops = (exir_ops.edge.aten.elu.default,) @@ -55,6 +58,8 @@ class DecomposeEluPass(ArmPass): - exir_ops.edge.aten.mul.Scalar """ + _passes_required_after: Set[Type[ExportPass]] = set() + def call_operator(self, op, args, kwargs, meta): if op not in edge_elu_ops: return super().call_operator(op, args, kwargs, meta, updated=False) diff --git a/backends/arm/_passes/decompose_embedding_pass.py b/backends/arm/_passes/decompose_embedding_pass.py index 6de971f402f..01226a7a38e 100644 --- a/backends/arm/_passes/decompose_embedding_pass.py +++ b/backends/arm/_passes/decompose_embedding_pass.py @@ -8,8 +8,10 @@ import logging from math import prod +from typing import Set, Type import torch +from executorch.backends.transforms.fuse_view_copy import FuseViewCopyTransform from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult @@ -33,6 +35,8 @@ class DecomposeEmbeddingPass(ExportPass): i = indices is expected to be int32 before this pass """ + _passes_required_after: Set[Type[ExportPass]] = {FuseViewCopyTransform} + aten_ops = (torch.ops.aten.embedding.default,) edge_ops = (exir_ops.edge.aten.embedding.default,) diff --git a/backends/arm/_passes/decompose_expm1_pass.py b/backends/arm/_passes/decompose_expm1_pass.py index 
5b1b90495b5..5de03cbf102 100644 --- a/backends/arm/_passes/decompose_expm1_pass.py +++ b/backends/arm/_passes/decompose_expm1_pass.py @@ -3,8 +3,19 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from typing import Set, Type + from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.convert_int_pow_to_mul import ConvertIntPowToMuls +from executorch.backends.arm._passes.decompose_div_pass import DecomposeDivPass +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass +from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass +from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass +from executorch.backends.arm._passes.replace_scalar_with_tensor_pass import ( + ReplaceScalarWithTensorArgPassTOSAMI, +) from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass edge_expm1_ops = (exir_ops.edge.aten.expm1.default,) # MI case @@ -68,6 +79,15 @@ class DecomposeExpm1Pass(ArmPass): - exir_ops.edge.aten.logical_and.default """ + _passes_required_after: Set[Type[ExportPass]] = { + ConvertIntPowToMuls, + InsertTableOpsPass, + DecomposeDivPass, + ReplaceScalarWithTensorArgPassTOSAMI, + MatchArgDtypePass, + MatchArgRanksPass, + } + def call_operator(self, op, args, kwargs, meta): if op not in edge_expm1_ops: return super().call_operator(op, args, kwargs, meta, updated=False) diff --git a/backends/arm/_passes/decompose_gelu_pass.py b/backends/arm/_passes/decompose_gelu_pass.py index 6e72175e68b..237b8199e82 100644 --- a/backends/arm/_passes/decompose_gelu_pass.py +++ b/backends/arm/_passes/decompose_gelu_pass.py @@ -3,8 +3,14 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+from typing import Set, Type + import torch from executorch.backends.arm._passes.arm_pass_utils import get_node_arg +from executorch.backends.arm._passes.fuse_constant_ops_pass import ComputeConstantOpsAOT +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass +from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass +from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -77,6 +83,13 @@ class DecomposeGeluPass(ExportPass): %op7 = mul(%op6, %FULL_0_5) """ + _passes_required_after: Set[Type[ExportPass]] = { + ComputeConstantOpsAOT, + InsertTableOpsPass, + MatchArgDtypePass, + MatchArgRanksPass, + } + def call_operator(self, op, args, kwargs, meta): if op not in torch_gelu + edge_gelu: return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_glu_pass.py b/backends/arm/_passes/decompose_glu_pass.py index 183dc89cf61..373b31c5995 100644 --- a/backends/arm/_passes/decompose_glu_pass.py +++ b/backends/arm/_passes/decompose_glu_pass.py @@ -3,9 +3,13 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+from typing import Set, Type + import torch from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass # For FP case @@ -36,6 +40,8 @@ def get_ops(op): class DecomposeGluPass(ArmPass): """Decomposes the GLU operator into hadamard product and sigmoid.""" + _passes_required_after: Set[Type[ExportPass]] = {InsertTableOpsPass} + def call_operator(self, op, args, kwargs, meta): if op not in [edge_glu, aten_glu]: return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_grouped_conv.py b/backends/arm/_passes/decompose_grouped_conv.py index ce9fe9c9937..916e43ee9a4 100644 --- a/backends/arm/_passes/decompose_grouped_conv.py +++ b/backends/arm/_passes/decompose_grouped_conv.py @@ -4,8 +4,10 @@ # LICENSE file in the root directory of this source tree. from copy import copy +from typing import Set, Type import torch +from executorch.backends.arm._passes.conv1d_unsqueeze_pass import Conv1dUnsqueezePass from executorch.backends.arm._passes.quant_args import QuantArgs from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -33,6 +35,8 @@ class DecomposeGroupedConv(ExportPass): x = cat(x1, x2) """ + _passes_required_after: Set[Type[ExportPass]] = {Conv1dUnsqueezePass} + @staticmethod def _get_decomposition(op): match op: diff --git a/backends/arm/_passes/decompose_groupnorm_pass.py b/backends/arm/_passes/decompose_groupnorm_pass.py index c6cb1b05e40..29d68234b29 100644 --- a/backends/arm/_passes/decompose_groupnorm_pass.py +++ b/backends/arm/_passes/decompose_groupnorm_pass.py @@ -6,12 +6,17 @@ # pyre-unsafe import operator +from typing import Set, Type import torch from executorch.backends.arm._passes import ArmPass from executorch.backends.arm._passes.arm_pass_utils import create_node +from 
executorch.backends.arm._passes.decompose_meandim_pass import DecomposeMeanDimPass +from executorch.backends.arm._passes.decompose_var_pass import DecomposeVarPass +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass +from executorch.backends.arm._passes.size_adjust_input_pass import SizeAdjustInputPass from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import PassResult +from executorch.exir.pass_base import ExportPass, PassResult def get_group_norm_decomposition(op) -> tuple: @@ -57,6 +62,13 @@ class DecomposeGroupNormPass(ArmPass): Source: https://pytorch.org/docs/stable/generated/torch.nn.GroupNorm.html """ + _passes_required_after: Set[Type[ExportPass]] = { + InsertTableOpsPass, + DecomposeMeanDimPass, + DecomposeVarPass, + SizeAdjustInputPass, + } + def call(self, graph_module: torch.fx.GraphModule): modified = False for node in graph_module.graph.nodes: diff --git a/backends/arm/_passes/decompose_int16_activation_conv2d_pass.py b/backends/arm/_passes/decompose_int16_activation_conv2d_pass.py new file mode 100644 index 00000000000..d43c2a8c89c --- /dev/null +++ b/backends/arm/_passes/decompose_int16_activation_conv2d_pass.py @@ -0,0 +1,145 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +# pyre-unsafe + +from typing import cast + +import torch +from executorch.backends.arm._passes.quant_args import QuantArgs + +from executorch.backends.arm.tosa.specification import get_context_spec, Tosa_1_00 +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass + + +class DecomposeConv2dWithInt16ActivationPass(ExportPass): + """ + This pass decomposes a convolution with input dtype int16 and bias + into a convolution without bias followed by an addition of the bias + since the TOSA op requires the bias to be int48 which is hard to represent + in torch. Instead rescale the int48 output to int16 and add the bias in int16. + """ + + def call_operator(self, op, args, kwargs, meta): + if op != exir_ops.edge.aten.convolution.default: + return super().call_operator(op, args, kwargs, meta) + + tosa_spec = get_context_spec() + if not tosa_spec.support_integer(): + return super().call_operator(op, args, kwargs, meta) + + # return if no bias + if args[2] is None: + return super().call_operator(op, args, kwargs, meta) + + if args[0].data.dtype == torch.int8: + return super().call_operator(op, args, kwargs, meta) + elif args[0].data.dtype == torch.int16: + if isinstance(tosa_spec, Tosa_1_00) and not tosa_spec.support_extension( + "int16" + ): + raise ValueError( + "int16 activation for convolution requires TOSA int16 extension" + ) + else: + raise NotImplementedError( + "Decomposition to conv+add only implemented for activation of int16 type" + ) + + # convolution with bias and activation is int16 + # The bias is assumed to be quantized with the same quantization parameters as + # as the output of the convolution + bias = args[2] + assert ( + meta.data["output_qparams"][0].dtype == bias.data.dtype + ), "Bias needs to have same type as quantized output type" + no_bias_args = list(args) + no_bias_args[2] = None + # split up to convolution + bias + convolution = super().call_operator(op, tuple(no_bias_args), kwargs, meta) 
+ + # create a copy of the meta without the qparams, to be used with the new nodes + new_meta = meta.copy() + new_meta.data.pop("output_qparams", None) + new_meta.data.pop("input_qparams", None) + + # reshape the tensor to the same rank as the convolution output to add the bias to the channels + channel_bias = super().call_operator( + exir_ops.edge.aten.view_copy.default, + (bias, [1, len(bias.data), 1, 1]), + {}, + new_meta, + ) + + output_dtype = meta.data["output_qparams"][0].dtype + + if output_dtype == torch.int16: + # The conv will get the output int48 scaled to int32 in serialization step. + # To be able to add the bias we need to first scale (cast?) the output to int32. + # The resulting i32 sum will then need to be scaled back to the output dtype. + + # calculate common rescale factor from convolution output and bias quantization + output_qparams = cast(QuantArgs, meta.data["output_qparams"][0]) + conv_output_scale = output_qparams.scale + bias_qparams = cast(QuantArgs, meta.data["input_qparams"][2]) + bias_scale = bias_qparams.scale + + common_scale = max(bias_scale, conv_output_scale) + + # calculate how we can rescale bias and conv to a common scale and maximize the output range + bias_rescale_factor = bias_scale / common_scale + conv_rescale_factor = conv_output_scale / common_scale + + # Either of conv output or bias now covers the full int16 range and the other one a smaller range. + # Since we are upscaling to int32 we have 16 additional bits to work with to maximize the output range. + # Worst case here is that both bias and conv output covers the full int16 range so we leave one bit + # and then one for the sign bit. 
+ bits_left_to_shift = 14 + + # update rescale factors + bias_rescale_factor *= 1 << bits_left_to_shift + conv_rescale_factor *= 1 << bits_left_to_shift + + conv_output = super().call_operator( + exir_ops.backend.tosa.RESCALE.default, + (convolution, torch.int32, conv_rescale_factor, 0, 0), + {}, + new_meta, + ) + + bias_rescaled = super().call_operator( + exir_ops.backend.tosa.RESCALE.default, + (channel_bias, torch.int32, bias_rescale_factor, 0, 0), + {}, + new_meta, + ) + + add = super().call_operator( + exir_ops.edge.aten.add.Tensor, + (conv_output, bias_rescaled), + {}, + new_meta, + ) + + res_rescale = super().call_operator( + exir_ops.backend.tosa.RESCALE.default, + ( + add, + output_dtype, + (common_scale / (conv_output_scale * (1 << bits_left_to_shift))), + 0, + 0, + ), + {}, + new_meta, + ) + + else: + raise NotImplementedError( + f"Decomposition to conv+add only implemented for activation of int16 type, not for {output_dtype}" + ) + + return res_rescale diff --git a/backends/arm/_passes/decompose_layernorm_pass.py b/backends/arm/_passes/decompose_layernorm_pass.py index e6cbdfb91a0..c73806b0022 100644 --- a/backends/arm/_passes/decompose_layernorm_pass.py +++ b/backends/arm/_passes/decompose_layernorm_pass.py @@ -6,12 +6,17 @@ # pyre-unsafe import operator +from typing import Set, Type import torch from executorch.backends.arm._passes import ArmPass from executorch.backends.arm._passes.arm_pass_utils import create_node +from executorch.backends.arm._passes.decompose_meandim_pass import DecomposeMeanDimPass +from executorch.backends.arm._passes.decompose_var_pass import DecomposeVarPass +from executorch.backends.arm._passes.fuse_constant_ops_pass import ComputeConstantOpsAOT +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import PassResult +from executorch.exir.pass_base import ExportPass, PassResult def 
get_layer_norm_decomposition(op) -> tuple: @@ -56,6 +61,13 @@ class DecomposeLayerNormPass(ArmPass): Source: https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html """ + _passes_required_after: Set[Type[ExportPass]] = { + ComputeConstantOpsAOT, + DecomposeMeanDimPass, + DecomposeVarPass, + InsertTableOpsPass, + } + def call(self, graph_module: torch.fx.GraphModule): for node in graph_module.graph.nodes: if node.op != "call_function" or node.target not in ( diff --git a/backends/arm/_passes/decompose_leaky_relu_pass.py b/backends/arm/_passes/decompose_leaky_relu_pass.py index e896cc584be..8ae13a76eb0 100644 --- a/backends/arm/_passes/decompose_leaky_relu_pass.py +++ b/backends/arm/_passes/decompose_leaky_relu_pass.py @@ -6,9 +6,12 @@ # pyre-unsafe +from typing import Set, Type + import torch from executorch.backends.arm._passes import ArmPass from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass edge_ops = (exir_ops.edge.aten.leaky_relu.default,) torch_ops = (torch.ops.aten.leaky_relu.default,) @@ -46,6 +49,8 @@ class DecomposeLeakyReLUPass(ArmPass): %op5 = add(%op1,%op4) """ + _passes_required_after: Set[Type[ExportPass]] = set() + def call_operator(self, op, args, kwargs, meta): if op not in (edge_ops + torch_ops): return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_linalg_vector_norm_pass.py b/backends/arm/_passes/decompose_linalg_vector_norm_pass.py index 9f036c0524f..ea5dd2d9b55 100644 --- a/backends/arm/_passes/decompose_linalg_vector_norm_pass.py +++ b/backends/arm/_passes/decompose_linalg_vector_norm_pass.py @@ -3,7 +3,11 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+from typing import Set, Type + import torch +from executorch.backends.arm._passes.decompose_sqrt_pass import DecomposeSqrtPass +from executorch.backends.arm._passes.decompose_sum_pass import DecomposeSumPass from executorch.exir.pass_base import ExportPass @@ -28,6 +32,11 @@ class DecomposeLinearVectorNormPass(ExportPass): dtype prior, but we dont know this from FX graph. """ + _passes_required_after: Set[Type[ExportPass]] = { + DecomposeSqrtPass, + DecomposeSumPass, + } + torch_linalg_vector_norm = (torch.ops.aten.linalg_vector_norm.default,) def call_operator(self, op, args, kwargs, meta): diff --git a/backends/arm/_passes/decompose_linear_pass.py b/backends/arm/_passes/decompose_linear_pass.py index 3d154d9b81e..70268c77a1d 100644 --- a/backends/arm/_passes/decompose_linear_pass.py +++ b/backends/arm/_passes/decompose_linear_pass.py @@ -5,6 +5,8 @@ # pyre-unsafe +from typing import Set, Type + import numpy as np from executorch.backends.arm._passes import ArmPass from executorch.backends.arm._passes.arm_pass_utils import ( @@ -12,7 +14,7 @@ get_first_fake_tensor, ) from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import PassResult +from executorch.exir.pass_base import ExportPass, PassResult class DecomposeLinearPass(ArmPass): @@ -25,6 +27,8 @@ class DecomposeLinearPass(ArmPass): output = view(conv2d) """ + _passes_required_after: Set[Type[ExportPass]] = set() + def call(self, graph_module): for node in graph_module.graph.nodes: if node.op != "call_function": diff --git a/backends/arm/_passes/decompose_logit_pass.py b/backends/arm/_passes/decompose_logit_pass.py index 40e2b22cb54..213b8f038e8 100644 --- a/backends/arm/_passes/decompose_logit_pass.py +++ b/backends/arm/_passes/decompose_logit_pass.py @@ -3,10 +3,19 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+from typing import Set, Type + import torch from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass +from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass +from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass +from executorch.backends.arm._passes.replace_scalar_with_tensor_pass import ( + ReplaceScalarWithTensorArgPassTOSAMI, +) from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass # For FP case @@ -60,6 +69,13 @@ class DecomposeLogitPass(ArmPass): log(y * reciprocal((-1) * y + 1)) """ + _passes_required_after: Set[Type[ExportPass]] = { + InsertTableOpsPass, + MatchArgRanksPass, + MatchArgDtypePass, + ReplaceScalarWithTensorArgPassTOSAMI, + } + def call_operator(self, op, args, kwargs, meta): if op not in [edge_logit, aten_logit]: return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_masked_fill.py b/backends/arm/_passes/decompose_masked_fill.py index fbf3079c92b..8c41c1a11bc 100644 --- a/backends/arm/_passes/decompose_masked_fill.py +++ b/backends/arm/_passes/decompose_masked_fill.py @@ -6,10 +6,16 @@ # pyre-unsafe +from typing import Set, Type + import torch from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.convert_full_like_to_full_pass import ( + ConvertFullLikeToFullPass, +) from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass edge_ops = (exir_ops.edge.aten.masked_fill.Scalar,) @@ -37,6 +43,8 @@ class DecomposeMaskedFill(ArmPass): Decomposed to a where and a full_like operator. 
""" + _passes_required_after: Set[Type[ExportPass]] = {ConvertFullLikeToFullPass} + def call_operator(self, op, args, kwargs, meta, updated=False): if op not in (edge_ops + aten_ops): return super().call_operator(op, args, kwargs, meta, updated) diff --git a/backends/arm/_passes/decompose_maxpool2d_with_dilation.py b/backends/arm/_passes/decompose_maxpool2d_with_dilation.py index ff6db260099..22d2ec1d85b 100644 --- a/backends/arm/_passes/decompose_maxpool2d_with_dilation.py +++ b/backends/arm/_passes/decompose_maxpool2d_with_dilation.py @@ -6,9 +6,12 @@ # pyre-unsafe import operator +from typing import Set, Type from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.size_adjust_input_pass import SizeAdjustInputPass from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass # We'll decompose only the EXIR edge max_pool2d ops when dilation > 1 EDGE_MAXPOOL2D = ( @@ -22,6 +25,10 @@ class DecomposeMaxPool2DPass(ArmPass): Decompose dilated max_pool2d (EXIR edge ops) into space-to-batch -> maxpool -> batch-to-space. 
""" + _passes_required_after: Set[Type[ExportPass]] = { + SizeAdjustInputPass, + } + def call_operator(self, op, args, kwargs, meta): # Only intercept EXIR edge max_pool2d ops if op not in EDGE_MAXPOOL2D: diff --git a/backends/arm/_passes/decompose_meandim_pass.py b/backends/arm/_passes/decompose_meandim_pass.py index a78514b6af5..4d4c0ee75b1 100644 --- a/backends/arm/_passes/decompose_meandim_pass.py +++ b/backends/arm/_passes/decompose_meandim_pass.py @@ -5,12 +5,17 @@ from copy import copy from math import prod +from typing import Set, Type import torch from executorch.backends.arm._passes import ArmPass from executorch.backends.arm._passes.arm_pass_utils import get_node_arg +from executorch.backends.arm._passes.decompose_sum_pass import DecomposeSumPass +from executorch.backends.arm._passes.fuse_constant_ops_pass import ComputeConstantOpsAOT +from executorch.backends.arm._passes.size_adjust_input_pass import SizeAdjustInputPass from executorch.exir.backend.utils import WhyNoPartitionReporter from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass def get_meandim_decomposition(op) -> tuple: @@ -62,6 +67,12 @@ class DecomposeMeanDimPass(ArmPass): x = view_copy.default(x, new_shape=(h)) # Squeeze dims since keepdims = False """ + _passes_required_after: Set[Type[ExportPass]] = { + ComputeConstantOpsAOT, + DecomposeSumPass, + SizeAdjustInputPass, + } + def __init__(self, graph_module, tosa_spec): super().__init__() self._graph_module = graph_module @@ -83,6 +94,8 @@ def call_operator(self, op, args, kwargs, meta): input_shape = list(x.data.shape) output_shape = list(meta["val"].shape) dims_to_reduce = get_node_arg(args, 1) + if dims_to_reduce is None: + dims_to_reduce = range(len(input_shape)) dims_to_reduce = [dim % len(input_shape) for dim in dims_to_reduce] dims_to_reduce = [dim for dim in dims_to_reduce if input_shape[dim] != 1] diff --git a/backends/arm/_passes/decompose_ne_pass.py 
b/backends/arm/_passes/decompose_ne_pass.py index 16443d5d2fb..3bd4f4540bb 100644 --- a/backends/arm/_passes/decompose_ne_pass.py +++ b/backends/arm/_passes/decompose_ne_pass.py @@ -3,9 +3,12 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from typing import Set, Type + import torch from executorch.backends.arm._passes import ArmPass from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass edge_ne_ops = (exir_ops.edge.aten.ne.Tensor,) aten_ne_ops = (torch.ops.aten.ne.Tensor, torch.ops.aten.ne_.Tensor) @@ -53,6 +56,8 @@ class DecomposeNotEqualPass(ArmPass): - followed by aten.logical_not.default or its edge equivalent """ + _passes_required_after: Set[Type[ExportPass]] = set() + def call_operator(self, op, args, kwargs, meta): if op not in (edge_ne_ops + aten_ne_ops): return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_round_pass.py b/backends/arm/_passes/decompose_round_pass.py index edfa3817064..35d36e80396 100644 --- a/backends/arm/_passes/decompose_round_pass.py +++ b/backends/arm/_passes/decompose_round_pass.py @@ -3,10 +3,13 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+from typing import Set, Type + import torch from executorch.backends.arm._passes import ArmPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.dialects.edge._ops import EdgeOpOverload +from executorch.exir.pass_base import ExportPass from torch._ops import OpOverload @@ -56,6 +59,8 @@ class DecomposeRoundPass(ArmPass): %result = where(%is_non_negative, %floor, %ceil) """ + _passes_required_after: Set[Type[ExportPass]] = set() + def call_operator(self, op, args, kwargs, meta, updated=False): if op not in (exir_ops.edge.aten.round.default, torch.ops.aten.round.default): return super().call_operator(op, args, kwargs, meta, updated) diff --git a/backends/arm/_passes/decompose_select.py b/backends/arm/_passes/decompose_select.py index 99c89f474ea..049409af6fd 100644 --- a/backends/arm/_passes/decompose_select.py +++ b/backends/arm/_passes/decompose_select.py @@ -6,11 +6,16 @@ # pyre-unsafe +from typing import Set, Type + import torch from executorch.backends.arm._passes.arm_pass_utils import ( create_node, get_first_fake_tensor, ) +from executorch.backends.arm._passes.convert_squeezes_to_view import ( + ConvertSqueezesToViewPass, +) from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult @@ -20,6 +25,8 @@ class DecomposeSelectPass(ExportPass): This pass decomposes select into slice + squeeze to ensure that Aten and TOSA outputs has the same rank (input rank -1) """ + _passes_required_after: Set[Type[ExportPass]] = {ConvertSqueezesToViewPass} + def call(self, graph_module: torch.fx.GraphModule): for node in graph_module.graph.nodes: diff --git a/backends/arm/_passes/decompose_sign_pass.py b/backends/arm/_passes/decompose_sign_pass.py index 1038ff0f3fa..c4cb964316d 100644 --- a/backends/arm/_passes/decompose_sign_pass.py +++ b/backends/arm/_passes/decompose_sign_pass.py @@ -3,10 +3,13 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in 
the root directory of this source tree. +from typing import Set, Type + import torch from executorch.backends.arm._passes import ArmPass from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass # For MI case @@ -42,6 +45,8 @@ def get_ops(op): class DecomposeSignPass(ArmPass): """Decomposes the sign operator into a sequence of operations that are supported by the Arm backend.""" + _passes_required_after: Set[Type[ExportPass]] = set() + def call_operator(self, op, args, kwargs, meta): if op not in (edge_sign, aten_sign): return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_silu_pass.py b/backends/arm/_passes/decompose_silu_pass.py index 68ebb3f4515..3d31552cf35 100644 --- a/backends/arm/_passes/decompose_silu_pass.py +++ b/backends/arm/_passes/decompose_silu_pass.py @@ -5,7 +5,10 @@ # pyre-unsafe +from typing import Set, Type + import torch +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.exir.pass_base import ExportPass aten_silu_ops = (torch.ops.aten.silu.default, torch.ops.aten.silu_.default) @@ -22,6 +25,8 @@ class DecomposeSiluPass(ExportPass): y = mul(a,x) """ + _passes_required_after: Set[Type[ExportPass]] = {InsertTableOpsPass} + def call_operator(self, op, args, kwargs, meta): if op not in (aten_silu_ops): return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_sinh_pass.py b/backends/arm/_passes/decompose_sinh_pass.py index 7192eb9bf74..acb18df3134 100644 --- a/backends/arm/_passes/decompose_sinh_pass.py +++ b/backends/arm/_passes/decompose_sinh_pass.py @@ -4,8 +4,17 @@ # LICENSE file in the root directory of this source tree. 
+from typing import Set, Type + from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass +from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass +from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass +from executorch.backends.arm._passes.replace_scalar_with_tensor_pass import ( + ReplaceScalarWithTensorArgPassTOSAMI, +) from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass # For MI case @@ -24,6 +33,13 @@ class DecomposeSinhPass(ArmPass): and scalar multiplication. """ + _passes_required_after: Set[Type[ExportPass]] = { + InsertTableOpsPass, + MatchArgRanksPass, + ReplaceScalarWithTensorArgPassTOSAMI, + MatchArgDtypePass, + } + def call_operator(self, op, args, kwargs, meta): if op is not edge_sinh: return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_softmax_pass.py b/backends/arm/_passes/decompose_softmax_pass.py index a735501f711..52df7cf6700 100644 --- a/backends/arm/_passes/decompose_softmax_pass.py +++ b/backends/arm/_passes/decompose_softmax_pass.py @@ -3,7 +3,11 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+from typing import Set, Type + import torch +from executorch.backends.arm._passes.decompose_sum_pass import DecomposeSumPass +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -62,6 +66,11 @@ class DecomposeSoftmaxPass(ExportPass): (in logsoftmax case: %op7 = log(%op6)) """ + _passes_required_after: Set[Type[ExportPass]] = { + DecomposeSumPass, + InsertTableOpsPass, + } + def call_operator(self, op, args, kwargs, meta): if op not in torch_softmax + edge_softmax: return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_softmax_unstable_pass.py b/backends/arm/_passes/decompose_softmax_unstable_pass.py index b6f5e11b66b..04e99a46b3e 100644 --- a/backends/arm/_passes/decompose_softmax_unstable_pass.py +++ b/backends/arm/_passes/decompose_softmax_unstable_pass.py @@ -5,9 +5,14 @@ # pyre-unsafe +from typing import Set, Type + import torch from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.decompose_sum_pass import DecomposeSumPass +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass # For BI case torch_softmax = (torch.ops.aten.softmax.int, torch.ops.aten.log_softmax.int) @@ -57,6 +62,11 @@ class DecomposeSoftmaxUnstablePass(ArmPass): (in logsoftmax case: %op5 = log(%op4)) """ + _passes_required_after: Set[Type[ExportPass]] = { + DecomposeSumPass, + InsertTableOpsPass, + } + def call_operator(self, op, args, kwargs, meta): if op not in torch_softmax + edge_softmax: return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/decompose_sqrt_pass.py b/backends/arm/_passes/decompose_sqrt_pass.py index 547d0091e90..3f4e608c4b9 100644 --- a/backends/arm/_passes/decompose_sqrt_pass.py +++ 
b/backends/arm/_passes/decompose_sqrt_pass.py @@ -4,9 +4,10 @@ # LICENSE file in the root directory of this source tree. # pyre-unsafe -from typing import Tuple, Union +from typing import Set, Tuple, Type, Union import torch +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -27,6 +28,7 @@ def get_sqrt_decomposition(op) -> Union[Tuple, torch._ops.OpOverload]: class DecomposeSqrtPass(ExportPass): + _passes_required_after: Set[Type[ExportPass]] = {InsertTableOpsPass} def call_operator(self, op, args, kwargs, meta): """ diff --git a/backends/arm/_passes/decompose_sum_pass.py b/backends/arm/_passes/decompose_sum_pass.py index 52b9c10c49f..16027ccec2b 100644 --- a/backends/arm/_passes/decompose_sum_pass.py +++ b/backends/arm/_passes/decompose_sum_pass.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+from typing import Set, Type + import torch from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -40,6 +42,8 @@ class DecomposeSumPass(ExportPass): view(shape = squeezed_shape) -> squeezed_shape """ + _passes_required_after: Set[Type[ExportPass]] = set() + def call_operator(self, op, args, kwargs, meta): if op not in [ exir_ops.edge.aten.sum.dim_IntList, diff --git a/backends/arm/_passes/decompose_var_pass.py b/backends/arm/_passes/decompose_var_pass.py index 15872738f3e..db5d820ac70 100644 --- a/backends/arm/_passes/decompose_var_pass.py +++ b/backends/arm/_passes/decompose_var_pass.py @@ -7,10 +7,16 @@ # pyre-unsafe +from typing import Set, Type + import torch from executorch.backends.arm._passes import ArmPass from executorch.backends.arm._passes.arm_pass_utils import get_node_arg +from executorch.backends.arm._passes.decompose_meandim_pass import DecomposeMeanDimPass +from executorch.backends.arm._passes.decompose_sum_pass import DecomposeSumPass +from executorch.backends.arm._passes.fuse_constant_ops_pass import ComputeConstantOpsAOT from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass def get_var_decomposition(op) -> tuple: @@ -47,6 +53,12 @@ class DecomposeVarPass(ArmPass): y = div(sum, max(0, N-correction)) """ + _passes_required_after: Set[Type[ExportPass]] = { + ComputeConstantOpsAOT, + DecomposeMeanDimPass, + DecomposeSumPass, + } + def call_operator(self, op, args, kwargs, meta): if op not in ( exir_ops.edge.aten.var.correction, diff --git a/backends/arm/_passes/decorate_fp32_to_int32_casting_pass.py b/backends/arm/_passes/decorate_fp32_to_int32_casting_pass.py index 17a682c0a8e..9d704520302 100644 --- a/backends/arm/_passes/decorate_fp32_to_int32_casting_pass.py +++ b/backends/arm/_passes/decorate_fp32_to_int32_casting_pass.py @@ -6,10 +6,13 @@ # pyre-unsafe +from typing import Set, Type + import torch from executorch.backends.arm._passes 
import ArmPass from executorch.backends.arm._passes.arm_pass_utils import get_node_arg from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass def _get_decorated_ops(op): @@ -40,6 +43,8 @@ class DecorateFp32toInt32CastingPass(ArmPass): output = to_dim_order_copy(decorated_x, dtype=torch.int32) """ + _passes_required_after: Set[Type[ExportPass]] = set() + targets = [ exir_ops.edge.dim_order_ops._to_dim_order_copy.default, ] diff --git a/backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py b/backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py index 491b404f0a4..477e007b8bf 100644 --- a/backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py +++ b/backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py @@ -8,15 +8,17 @@ import copy -from typing import cast, Dict, Set, Tuple +from typing import cast, Dict, Set, Tuple, Type from executorch.backends.arm._passes import ArmPass from executorch.backends.arm._passes.arm_pass_utils import ( get_param_tensor, is_param_node, ) +from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.backends.arm._passes.quant_args import QuantArgs +from executorch.backends.arm._passes.remove_noop_pass import RemoveNoopPass from executorch.backends.arm.constants import DQ_OPS, Q_OPS from executorch.exir.dialects._ops import ops as exir_ops @@ -70,6 +72,44 @@ def get_output_qparams(node: Node) -> dict[int, QuantArgs]: return output_qparams +class RetraceFoldedDtypesPass(ExportPass): + """ + FoldAndAnnotateQParamsPass folds dq and q nodes. When the graph is retraced + some operators are retraced to types that cannot be handled by TOSA. One + such example is sum.dim_IntList: + q (int8) -> dq (fp32) -> sum (fp32) -> q (int8) ... + After folding it becomes: + q (int8) -> sum (int64) -> ... + This pass changes types of ops in self.targeted_ops, such as sum, so that + the output type of that matches the type of the output_qparams. 
+ """ + + _passes_required_after: Set[Type[ExportPass]] = set() + + targeted_ops: Set[EdgeOpOverload] = { + exir_ops.edge.aten.sum.dim_IntList, + } + + def call_operator( + self, + op, # pyre-ignore + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + meta: NodeMetadata, + ) -> ProxyValue: + if op not in self.targeted_ops: + return super().call_operator(op, args, kwargs, meta) + + node_kwargs = kwargs.copy() + output_qparams = meta["output_qparams"] + if len(output_qparams) == 0: + return super().call_operator(op, args, kwargs, meta) + + output_dtype = output_qparams[0].dtype + node_kwargs["dtype"] = output_dtype + return super().call_operator(op, args, node_kwargs, meta) + + class FoldAndAnnotateQParamsPass(ArmPass): """ A pass that walks the graph and removes any DQ and Q nodes before and after the target @@ -100,6 +140,12 @@ class FoldAndAnnotateQParamsPass(ArmPass): """ + _passes_required_after: Set[Type[ExportPass]] = { + RetraceFoldedDtypesPass, + InsertTableOpsPass, + RemoveNoopPass, + } + def fold_and_annotate_arg( self, graph_module: GraphModule, node: Node, arg_list: list[Node], i: int ) -> None: @@ -210,6 +256,8 @@ class QuantizeOperatorArguments(ExportPass): - Makes sure the min and max values to clamp.default are quantized, if it's a quantized operator. """ + _passes_required_after: Set[Type[ExportPass]] = {FoldAndAnnotateQParamsPass} + def call(self, graph_module: GraphModule) -> PassResult: modified = False # Loop over the graph nodes and find full.default nodes. @@ -243,39 +291,3 @@ def call(self, graph_module: GraphModule) -> PassResult: modified = True return PassResult(graph_module, modified) - - -class RetraceFoldedDtypesPass(ExportPass): - """ - FoldAndAnnotateQParamsPass folds dq and q nodes. When the graph is retraced - some operators are retraced to types that cannot be handled by TOSA. One - such example is sum.dim_IntList: - q (int8) -> dq (fp32) -> sum (fp32) -> q (int8) ... 
- After folding it becomes: - q (int8) -> sum (int64) -> ... - This pass changes types of ops in self.targeted_ops, such as sum, so that - the output type of that matches the type of the output_qparams. - """ - - targeted_ops: Set[EdgeOpOverload] = { - exir_ops.edge.aten.sum.dim_IntList, - } - - def call_operator( - self, - op, # pyre-ignore - args: Tuple[Argument, ...], - kwargs: Dict[str, Argument], - meta: NodeMetadata, - ) -> ProxyValue: - if op not in self.targeted_ops: - return super().call_operator(op, args, kwargs, meta) - - node_kwargs = kwargs.copy() - output_qparams = meta["output_qparams"] - if len(output_qparams) == 0: - return super().call_operator(op, args, kwargs, meta) - - output_dtype = output_qparams[0].dtype - node_kwargs["dtype"] = output_dtype - return super().call_operator(op, args, node_kwargs, meta) diff --git a/backends/arm/_passes/fuse_batchnorm2d_pass.py b/backends/arm/_passes/fuse_batchnorm2d_pass.py index 2dbdfa84cec..8be6b61d25c 100644 --- a/backends/arm/_passes/fuse_batchnorm2d_pass.py +++ b/backends/arm/_passes/fuse_batchnorm2d_pass.py @@ -5,11 +5,14 @@ # pyre-unsafe +from typing import Set, Type + import torch from executorch.backends.arm._passes.arm_pass_utils import ( create_node, get_first_fake_tensor, ) +from executorch.backends.arm.common.debug import get_node_debug_info from executorch.backends.transforms.utils import ( create_constant_placeholder, delete_constant_placeholder, @@ -28,6 +31,8 @@ class FuseBatchnorm2DPass(ExportPass): the weights and bias of the convolution and removing the batchnorm. 
""" + _passes_required_after: Set[Type[ExportPass]] = set() + def __init__(self, exported_program: ExportedProgram): self.exported_program = exported_program super().__init__() @@ -56,8 +61,16 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: # noqa: C901 input_node = node.all_input_nodes[0] is_single_user = len(input_node.users) == 1 bn_weight_node, bn_bias_node, bn_mean_node, bn_var_node = node.args[1:5] - assert bn_mean_node is not None, "Batchnorm mean node cannot be None." - assert bn_var_node is not None, "Batchnorm var node cannot be None." + if bn_mean_node is None: + raise RuntimeError( + "BatchNorm mean buffer missing for node: " + f"{get_node_debug_info(node, graph_module)}" + ) + if bn_var_node is None: + raise RuntimeError( + "BatchNorm variance buffer missing for node: " + f"{get_node_debug_info(node, graph_module)}" + ) epsilon = node.args[-1] @@ -129,14 +142,23 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: # noqa: C901 input_node = new_input_node else: input_weight_node, input_bias_node = input_node.args[1:3] - assert ( - isinstance(input_weight_node, Node) - and input_weight_node.op == "placeholder" - ), "Parameter weight of convolution must be a placeholder" - assert (input_bias_node is None) or ( + if not ( isinstance(input_weight_node, Node) and input_weight_node.op == "placeholder" - ), "Parameter bias of convolution must be a placeholder or None" + ): + raise RuntimeError( + "Parameter weight of convolution must be a placeholder" + ) + if not ( + (input_bias_node is None) + or ( + isinstance(input_weight_node, Node) + and input_weight_node.op == "placeholder" + ) + ): + raise RuntimeError( + "Parameter bias of convolution must be a placeholder or None" + ) input_weight_tensor = torch.Tensor( get_param(self.exported_program, input_weight_node) diff --git a/backends/arm/_passes/fuse_constant_ops_pass.py b/backends/arm/_passes/fuse_constant_ops_pass.py index f49565e3c38..c48fc008b5d 100644 --- 
a/backends/arm/_passes/fuse_constant_ops_pass.py +++ b/backends/arm/_passes/fuse_constant_ops_pass.py @@ -4,6 +4,7 @@ # LICENSE file in the root directory of this source tree. import logging +from typing import Set, Type import torch._export.utils import torch.fx @@ -13,6 +14,9 @@ get_param_tensor, is_persistent_buffer, ) +from executorch.backends.arm._passes.fuse_equal_placeholders_pass import ( + FuseEqualPlaceholdersPass, +) from executorch.backends.transforms.utils import ( create_constant_placeholder, delete_constant_placeholder, @@ -41,6 +45,8 @@ def f(): return x """ + _passes_required_after: Set[Type[ExportPass]] = set() + def __init__(self, exported_program: ExportedProgram) -> None: super().__init__() self.exported_program = exported_program @@ -108,8 +114,10 @@ def call(self, graph_module): if node.op != "call_function": continue if node.target in [ - exir_ops.backend.tosa.TABLE.default, + exir_ops.backend.tosa.MATMUL.default, exir_ops.backend.tosa.RESCALE.default, + exir_ops.backend.tosa.RESIZE.default, + exir_ops.backend.tosa.TABLE.default, exir_ops.backend.tosa.TRANSPOSE.default, ]: continue @@ -168,6 +176,8 @@ def f(node_name_pre_computed): return node_name_pre_computed """ + _passes_required_after: Set[Type[ExportPass]] = {FuseEqualPlaceholdersPass} + targeted_ops = [ exir_ops.edge.aten.full.default, exir_ops.edge.aten.arange.start_step, diff --git a/backends/arm/_passes/fuse_equal_placeholders_pass.py b/backends/arm/_passes/fuse_equal_placeholders_pass.py index 5631e2f32e9..b8b8143e6c5 100644 --- a/backends/arm/_passes/fuse_equal_placeholders_pass.py +++ b/backends/arm/_passes/fuse_equal_placeholders_pass.py @@ -5,13 +5,16 @@ import hashlib from collections import defaultdict +from typing import Set, Type import torch + from executorch.backends.arm._passes.arm_pass_utils import ( get_constant_placeholder_kind, get_param_tensor, is_param_node, ) +from executorch.backends.arm.tosa.mapping import TosaSpecialDtype from 
executorch.backends.transforms.utils import ( create_constant_placeholder, delete_constant_placeholder, @@ -27,6 +30,8 @@ class FuseEqualPlaceholdersPass(ExportPass): with multiple users, using a cache for faster comparison. """ + _passes_required_after: Set[Type[ExportPass]] = set() + def __init__(self, exported_program: ExportedProgram): self.exported_program = exported_program super().__init__() @@ -44,9 +49,14 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: continue # Create a lightweight fingerprint: dtype + shape + SHA1 of raw bytes # Ensure tensor is on CPU and contiguous + + # ensure we don't merge any special case int48_t tensors with int32_t tensors + # since int48_t tensors needs to be instantiated separately. + is_int48 = node.meta.get(TosaSpecialDtype.meta_key(), None) t_cpu = tensor.detach().cpu().contiguous() data_bytes = t_cpu.numpy().tobytes() key = ( + is_int48, str(t_cpu.dtype), tuple(t_cpu.shape), hashlib.sha1(data_bytes).hexdigest(), diff --git a/backends/arm/_passes/fuse_quantized_activation_pass.py b/backends/arm/_passes/fuse_quantized_activation_pass.py index 46a7d7f6f98..1076a3df658 100644 --- a/backends/arm/_passes/fuse_quantized_activation_pass.py +++ b/backends/arm/_passes/fuse_quantized_activation_pass.py @@ -5,15 +5,28 @@ # pyre-unsafe +from typing import Set, Type + import torch +from executorch.backends.arm._passes.convert_to_clamp import ConvertToClampPass +from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( + FoldAndAnnotateQParamsPass, +) from executorch.backends.arm._passes.quant_args import QuantArgs from executorch.backends.arm.constants import Q_OPS +from executorch.backends.transforms.remove_getitem_op import RemoveGetItemPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult from torch.fx import Node class FuseQuantizedActivationPass(ExportPass): + _passes_required_after: Set[Type[ExportPass]] = { + 
ConvertToClampPass, + FoldAndAnnotateQParamsPass, + RemoveGetItemPass, + } + @staticmethod def _is_fuseable_quantized_activation(node: Node): """Fuse activations that have a 0 lower bound and quantized with a qmin zero-point""" diff --git a/backends/arm/_passes/insert_int32_casts_after_int64_placeholders.py b/backends/arm/_passes/insert_int32_casts_after_int64_placeholders.py index 4b619af790c..c6e6f70a630 100644 --- a/backends/arm/_passes/insert_int32_casts_after_int64_placeholders.py +++ b/backends/arm/_passes/insert_int32_casts_after_int64_placeholders.py @@ -8,8 +8,13 @@ import logging +from typing import Set, Type + import torch from executorch.backends.arm._passes.arm_pass_utils import create_node +from executorch.backends.arm._passes.decompose_embedding_pass import ( + DecomposeEmbeddingPass, +) from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import EdgeOpOverload, ExportPass, PassResult from torch._subclasses.fake_tensor import FakeTensor @@ -26,6 +31,8 @@ class InsertInt32CastsAfterInt64PlaceholdersPass(ExportPass): the int32 range. """ + _passes_required_after: Set[Type[ExportPass]] = {DecomposeEmbeddingPass} + # Ops that require i64 inputs → positions of args to upcast. # Key: op overload; Value: zero-based indices of positional args that must be i64. I64_INPUT_ARG_POSITIONS = { diff --git a/backends/arm/_passes/insert_rescales_pass.py b/backends/arm/_passes/insert_rescales_pass.py index 7f75aecf24c..d56e70e78b3 100644 --- a/backends/arm/_passes/insert_rescales_pass.py +++ b/backends/arm/_passes/insert_rescales_pass.py @@ -4,9 +4,14 @@ # LICENSE file in the root directory of this source tree. 
from copy import copy -from typing import cast +from typing import cast, Dict, Optional, Set, Tuple, Type -from executorch.backends.arm._passes.arm_pass_utils import create_node +import torch +from executorch.backends.arm._passes.arm_pass import ArmPass +from executorch.backends.arm._passes.arm_pass_utils import create_node, set_node_arg +from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( + get_output_qparams, +) from executorch.backends.arm._passes.quant_args import QuantArgs from executorch.backends.arm.constants import DQ_OPS, Q_OPS from executorch.exir.dialects._ops import ops as exir_ops @@ -24,6 +29,8 @@ class InsertRescalePass(ExportPass): in the fake implementation of. """ + _passes_required_after: Set[Type[ExportPass]] = set() + def fold_dq_q_to_rescale(self, node: Node, user: Node, graph_module: GraphModule): dq_args = QuantArgs.from_operator(node.target, node.args) q_args = QuantArgs.from_operator(user.target, user.args) @@ -63,3 +70,234 @@ def call(self, graph_module: GraphModule) -> PassResult: graph_module = super().call(graph_module).graph_module graph_module.recompile() return PassResult(graph_module, modified) + + +class InsertRescaleInt32Pass(ArmPass): + """ + Numerous TOSA ops require inputs and outputs to be 32-bit integers in their + quantized implementations. This pass treats such operator nodes by + inserting rescale ops before and after them if needed. Note that extra logic + that handles the scales and zero points must be in place because the affected + TOSA have naive implementations that do not account for the quantization + parameters. 
+ """ + + _passes_required_after: Set[Type[ExportPass]] = set() + + included_targets = [ + exir_ops.edge.aten.abs.default, + exir_ops.edge.aten.eq.Tensor, + exir_ops.edge.aten.ge.Tensor, + exir_ops.edge.aten.gt.Tensor, + exir_ops.edge.aten.le.Tensor, + exir_ops.edge.aten.lt.Tensor, + exir_ops.edge.aten.maximum.default, + exir_ops.edge.aten.minimum.default, + ] + + def _int32_qargs(self, s): + """Helper creator function for INT32-based QuantArgs""" + + return QuantArgs( + scale=s, + zp=0, + qmin=torch.iinfo(torch.int32).min, + qmax=torch.iinfo(torch.int32).max, + dtype=torch.int32, + ) + + def _get_inputs_rescaled_qparams( + self, target, input_qparams: Dict[int, QuantArgs] + ) -> Dict[int, QuantArgs]: + """Get the qparams for the INT32 operands to the op ``target`` + + Inputs to the INT32-based operator must be rescaled from INT8 to INT32. + This function computes the ``QuantArgs`` for each of the operands and returns + it as a dict, mapping tensor index to ``QuantArgs``. + """ + + if target in [ + exir_ops.edge.aten.abs.default, + exir_ops.edge.aten.eq.Tensor, + exir_ops.edge.aten.ge.Tensor, + exir_ops.edge.aten.gt.Tensor, + exir_ops.edge.aten.le.Tensor, + exir_ops.edge.aten.lt.Tensor, + exir_ops.edge.aten.minimum.default, + exir_ops.edge.aten.maximum.default, + ]: + # For these ops, use the smallest scale among the INT8 operands. 
+ min_scale = min( + [qp.get_scale_per_tensor() for qp in input_qparams.values()] + ) + qparams = { + i: self._int32_qargs(min_scale) for i in range(len(input_qparams)) + } + else: + raise ValueError(f"Not a valid target: {target}") + + return qparams + + def _get_output_qparams( + self, target, inputs_qparams: Dict[int, QuantArgs] + ) -> Optional[QuantArgs]: + """Given an op ``target`` and the ``QuantArgs`` for each of its inputs, compute + the scale of the output based on how the operator itself affects it.""" + + if target in [ + exir_ops.edge.aten.abs.default, + exir_ops.edge.aten.maximum.default, + exir_ops.edge.aten.minimum.default, + ]: + # The op has not altered the scale; the output scale is equal to + # the operands' scales. + return self._int32_qargs(inputs_qparams[0].get_scale_per_tensor()) + elif target in [ + exir_ops.edge.aten.eq.Tensor, + exir_ops.edge.aten.ge.Tensor, + exir_ops.edge.aten.gt.Tensor, + exir_ops.edge.aten.le.Tensor, + exir_ops.edge.aten.lt.Tensor, + ]: + # Output is bool for these ops and thus no qparams are present + return None + else: + raise ValueError(f"Not a valid target: {target}") + + def _get_rescale_qparams( + self, target, input_qparams: Dict[int, QuantArgs] + ) -> Tuple[Dict[int, QuantArgs], Optional[QuantArgs]]: + """ + Get the quantization parameters of the INT32 inputs/outputs that will + surround the node after the new RESCALE ops have been inserted. 
+ """ + + inputs_rescaled_qparams = self._get_inputs_rescaled_qparams( + target, input_qparams + ) + output_qparams = self._get_output_qparams(target, inputs_rescaled_qparams) + + return (inputs_rescaled_qparams, output_qparams) + + def _rescale_inputs(self, graph, node, rescale_qargs: Dict[int, QuantArgs]) -> bool: + qargs = node.meta["input_qparams"] + + args_copy = list(node.args) + seen_args = set() + modified = False + for i in qargs: + qp = qargs[i] + if qp.dtype != torch.int8: + continue + + arg_node = args_copy[i] + if arg_node in seen_args: + continue + seen_args.add(arg_node) + + with graph.inserting_after(arg_node): + rescale_node = create_node( + graph, + exir_ops.backend.tosa.RESCALE.default, + ( + arg_node, + torch.int32, + qp.get_scale_per_tensor() + / rescale_qargs[ + i + ].get_scale_per_tensor(), # Old scale / new scale + qp.get_zp_per_tensor(), # Old zero point + rescale_qargs[i].get_zp_per_tensor(), # New zero point + ), + from_node=node, + ) + + node.replace_input_with(arg_node, rescale_node) + modified = True + + return modified + + def _rescale_outputs(self, graph, node, rescale_qargs: Optional[QuantArgs]) -> bool: + if "output_qparams" not in node.meta or len(node.meta["output_qparams"]) == 0: + return False + + qargs = get_output_qparams(node) + assert len(qargs) == 1 + assert rescale_qargs is not None + + qarg = qargs[0] + if qarg.dtype != torch.int8: + return False + + users_copy = list(node.users) + + with graph.inserting_after(node): + rescale_node = create_node( + graph, + exir_ops.backend.tosa.RESCALE.default, + ( + node, + torch.int8, + rescale_qargs.get_scale_per_tensor() + / qarg.get_scale_per_tensor(), # Old scale / new scale + rescale_qargs.get_zp_per_tensor(), # Old zero point + qarg.get_zp_per_tensor(), # New zero point + ), + from_node=node, + ) + + for user in users_copy: + user.replace_input_with(node, rescale_node) + + return True + + def call(self, graph_module: GraphModule) -> PassResult: + graph = graph_module.graph + + 
modified = False + for node in list(graph.nodes): + node = cast(Node, node) + + if node.op != "call_function" or node.target not in self.included_targets: + continue + + if "input_qparams" not in node.meta or len(node.meta["input_qparams"]) == 0: + continue + input_qparams = node.meta["input_qparams"] + + inputs_rescale_qargs, output_rescale_qargs = self._get_rescale_qparams( + node.target, input_qparams + ) + + inputs_was_rescaled = self._rescale_inputs( + graph, node, inputs_rescale_qargs + ) + outputs_was_rescaled = False + if inputs_was_rescaled: + outputs_was_rescaled = self._rescale_outputs( + graph, node, output_rescale_qargs + ) + modified = True + + # Update node metadata + + if inputs_was_rescaled: + assert len(inputs_rescale_qargs) == len(node.meta["input_qparams"]) + node.meta["input_qparams"] = inputs_rescale_qargs + + if outputs_was_rescaled: + assert len(node.meta["output_qparams"]) == 1 + node.meta["output_qparams"] = {0: output_rescale_qargs} + + # If the output type is specified in the node, change it such + # that it matches the subsequent rescale node(s) that this node + # now has output edges to. 
+ if "dtype" in node.kwargs: + set_node_arg(node, "dtype", torch.int32) + + if modified: + # Retrace the graph to update the fake tensor types + graph_module = super().call(graph_module).graph_module + graph_module.recompile() + + return PassResult(graph_module, modified) diff --git a/backends/arm/_passes/insert_table_ops.py b/backends/arm/_passes/insert_table_ops.py index fb5d7de5e12..d838ddc823d 100644 --- a/backends/arm/_passes/insert_table_ops.py +++ b/backends/arm/_passes/insert_table_ops.py @@ -6,7 +6,7 @@ # pyre-unsafe from itertools import chain -from typing import Callable, cast, Dict, Iterator, Set +from typing import Callable, cast, Dict, Iterator, Set, Type import torch from executorch.backends.arm._passes.arm_pass_utils import create_node @@ -117,6 +117,8 @@ class InsertTableOpsPass(ExportPass): which will be used to produce the table values in operators/op_table.py. """ + _passes_required_after: Set[Type[ExportPass]] = set() + def __init__(self, exported_program: ExportedProgram) -> None: super().__init__() self.exported_program = exported_program diff --git a/backends/arm/_passes/match_arg_dtype_pass.py b/backends/arm/_passes/match_arg_dtype_pass.py index e7bf3b2d60e..d482614b03f 100644 --- a/backends/arm/_passes/match_arg_dtype_pass.py +++ b/backends/arm/_passes/match_arg_dtype_pass.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+from typing import Set, Type + import torch from executorch.backends.arm._passes.arm_pass_utils import create_node, get_node_arg from executorch.exir.dialects._ops import ops as exir_ops @@ -38,6 +40,8 @@ class MatchArgDtypePass(ExportPass): """ + _passes_required_after: Set[Type[ExportPass]] = set() + targeted_ops = {exir_ops.edge.aten.sub.Tensor, exir_ops.edge.aten.where.self} def call(self, graph_module: torch.fx.GraphModule): diff --git a/backends/arm/_passes/match_arg_ranks_pass.py b/backends/arm/_passes/match_arg_ranks_pass.py index d6cdfacb612..c411f3b8083 100644 --- a/backends/arm/_passes/match_arg_ranks_pass.py +++ b/backends/arm/_passes/match_arg_ranks_pass.py @@ -7,7 +7,7 @@ # pyre-unsafe -from typing import cast +from typing import cast, Set, Type from executorch.backends.arm._passes.arm_pass_utils import ( create_node, @@ -36,6 +36,8 @@ class MatchArgRanksPass(ExportPass): input2 = shape(1, 3, 1) """ + _passes_required_after: Set[Type[ExportPass]] = set() + def __init__(self, exported_program): super().__init__() self.exported_program = exported_program diff --git a/backends/arm/_passes/mm_to_bmm_pass.py b/backends/arm/_passes/mm_to_bmm_pass.py index 69d8573013e..c6f4786365d 100644 --- a/backends/arm/_passes/mm_to_bmm_pass.py +++ b/backends/arm/_passes/mm_to_bmm_pass.py @@ -6,12 +6,20 @@ # pyre-unsafe +from typing import Set, Type + import torch from executorch.backends.arm._passes.arm_pass_utils import ( create_node, get_first_fake_tensor, insert_q_dq_pair, ) +from executorch.backends.arm._passes.convert_squeezes_to_view import ( + ConvertSqueezesToViewPass, +) +from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( + FoldAndAnnotateQParamsPass, +) from executorch.backends.arm.constants import DQ_OPS, Q_OPS from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult @@ -28,6 +36,11 @@ class ConvertMmToBmmPass(ExportPass): 3) Squeeze output tensor to rank 2. 
""" + _passes_required_after: Set[Type[ExportPass]] = { + ConvertSqueezesToViewPass, + FoldAndAnnotateQParamsPass, + } + def call(self, graph_module: torch.fx.GraphModule): modified_graph = False graph = graph_module.graph diff --git a/backends/arm/_passes/remove_noop_pass.py b/backends/arm/_passes/remove_noop_pass.py index 623517aac59..55c4f71f0a8 100644 --- a/backends/arm/_passes/remove_noop_pass.py +++ b/backends/arm/_passes/remove_noop_pass.py @@ -7,6 +7,7 @@ # pyre-unsafe import logging +from typing import Set, Type from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -17,6 +18,8 @@ class RemoveNoopPass(ExportPass): """Remove no-ops from graph_module""" + _passes_required_after: Set[Type[ExportPass]] = set() + def call_operator(self, op, args, kwargs, meta): if op not in ( exir_ops.edge.dim_order_ops._clone_dim_order.default, diff --git a/backends/arm/_passes/replace_inf_values_pass.py b/backends/arm/_passes/replace_inf_values_pass.py index 8c721eda3d8..506030d82d7 100644 --- a/backends/arm/_passes/replace_inf_values_pass.py +++ b/backends/arm/_passes/replace_inf_values_pass.py @@ -7,6 +7,8 @@ # This pass is based on backends/qualcomm/_passes/replace_inf_values.py # with some modification to replaced inf values. +from typing import Set, Type + import torch from executorch.exir.pass_base import ExportPass, PassResult @@ -16,6 +18,8 @@ class ReplaceInfValues(ExportPass): Due to limitation in Quantizer, we need to change inf/-inf to more quantizable values. 
""" + _passes_required_after: Set[Type[ExportPass]] = set() + def __init__(self): super(ReplaceInfValues, self).__init__() diff --git a/backends/arm/_passes/replace_scalar_with_tensor_pass.py b/backends/arm/_passes/replace_scalar_with_tensor_pass.py index 249eb9ffd41..f6ef056f677 100644 --- a/backends/arm/_passes/replace_scalar_with_tensor_pass.py +++ b/backends/arm/_passes/replace_scalar_with_tensor_pass.py @@ -6,7 +6,7 @@ # pyre-unsafe -from typing import Dict, Union +from typing import Dict, Set, Type, Union import torch from executorch.backends.transforms.replace_scalar_with_tensor import ( @@ -15,6 +15,7 @@ from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.dialects.edge._ops import EdgeOpOverload +from executorch.exir.pass_base import ExportPass # Operators that are included for both TOSA profiles @@ -56,6 +57,8 @@ class ReplaceScalarWithTensorArgPassTOSAMI(ReplaceScalarWithTensorArgPass): + _passes_required_after: Set[Type[ExportPass]] = set() + scalar_to_tensor_ops = _common_ops | { exir_ops.edge.aten.pow.Tensor_Scalar: exir_ops.edge.aten.pow.Tensor_Tensor, torch.ops.aten.pow.Tensor_Scalar: torch.ops.aten.pow.Tensor_Tensor, @@ -66,6 +69,8 @@ def __init__(self): class ReplaceScalarWithTensorArgPassTOSABI(ReplaceScalarWithTensorArgPass): + _passes_required_after: Set[Type[ExportPass]] = set() + scalar_to_tensor_ops = _common_ops def __init__(self): diff --git a/backends/arm/_passes/rewrite_matmul.py b/backends/arm/_passes/rewrite_matmul.py new file mode 100644 index 00000000000..28ff800792b --- /dev/null +++ b/backends/arm/_passes/rewrite_matmul.py @@ -0,0 +1,97 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Set, Type + +import torch +from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.arm_pass_utils import ( + create_node, + get_first_fake_tensor, +) +from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( + get_input_qparams, + get_output_qparams, +) +from executorch.backends.arm.tosa.mapping import TosaSpecialDtype +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult + + +class RewriteMatmulPass(ArmPass): + """Rewrites aten.bmm to tosa.MATMUL and inserts a tosa.RESCALE op if needed.""" + + _passes_required_after: Set[Type[ExportPass]] = set() + + def _insert_output_rescale(self, graph_module, node, tosa_matmul_node, dtype): + input_qparams = get_input_qparams(node) + output_qparams = get_output_qparams(node)[0] + scale = ( + input_qparams[0].get_scale_per_tensor() + * input_qparams[1].get_scale_per_tensor() + ) / output_qparams.get_scale_per_tensor() + + with graph_module.graph.inserting_after(tosa_matmul_node): + # If the input is int8, we need to cast the output to int32 + rescale_node = create_node( + graph_module.graph, + op_target=exir_ops.backend.tosa.RESCALE.default, + from_node=tosa_matmul_node, + ) + tosa_matmul_node.replace_all_uses_with(rescale_node) + rescale_node.args = ( + tosa_matmul_node, + dtype, + scale, + 0, + output_qparams.get_zp_per_tensor(), + ) + + def call(self, graph_module): + modified = False + for node in graph_module.graph.nodes: + if ( + node.op != "call_function" + or node.target != exir_ops.edge.aten.bmm.default + ): + continue + modified = True + + x1, x2 = node.args + tosa_matmul_target = exir_ops.backend.tosa.MATMUL.default + with graph_module.graph.inserting_before(node): + tosa_matmul_node = create_node( + graph_module.graph, + op_target=tosa_matmul_target, + args=(x1, x2), + kwargs={}, + from_node=node, + ) + node.replace_all_uses_with(tosa_matmul_node) + 
graph_module.graph.erase_node(node) + + x1_fake_tensor = get_first_fake_tensor(x1) + x2_fake_tensor = get_first_fake_tensor(x2) + output_fake_tensor = tosa_matmul_target(x1_fake_tensor, x2_fake_tensor) + node_output_fake_tensor = get_first_fake_tensor(node) + if ( + output_fake_tensor.dtype == torch.int32 + and node_output_fake_tensor.dtype in (torch.int8, torch.int16) + ): + self._insert_output_rescale( + graph_module, + node, + tosa_matmul_node, + dtype=node_output_fake_tensor.dtype, + ) + if x1_fake_tensor.dtype == torch.int16: + tosa_matmul_node.meta[TosaSpecialDtype.meta_key()] = ( + TosaSpecialDtype.INT48 + ) + + if modified: + graph_module.recompile() + graph_module = super().call(graph_module).graph_module + return PassResult(graph_module, modified) diff --git a/backends/arm/_passes/rewrite_upsample.py b/backends/arm/_passes/rewrite_upsample.py new file mode 100644 index 00000000000..c9f25a1e845 --- /dev/null +++ b/backends/arm/_passes/rewrite_upsample.py @@ -0,0 +1,84 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Set, Type + +import torch +from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes.arm_pass_utils import ( + create_node, + get_first_fake_tensor, +) +from executorch.backends.arm.tosa.utils import get_resize_parameters +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult + + +class RewriteUpsamplePass(ArmPass): + """Rewrite upsample2d nodes to TOSA.RESIZE nodes.""" + + targeted_ops = ( + exir_ops.edge.aten.upsample_nearest2d.vec, + exir_ops.edge.aten.upsample_bilinear2d.vec, + ) + + _passes_required_after: Set[Type[ExportPass]] = set() + + def call(self, graph_module): + modified = False + for node in graph_module.graph.nodes: + if node.op != "call_function" or node.target not in self.targeted_ops: + continue + modified = True + + if node.target == exir_ops.edge.aten.upsample_bilinear2d.vec: + x, output_size, align_corners, scale_factors = node.args + resize_mode = "bilinear" + else: + x, output_size, scale_factors = node.args + align_corners = False + resize_mode = "nearest" + + with graph_module.graph.inserting_before(node): + tosa_resize_node = create_node( + graph_module.graph, + op_target=exir_ops.backend.tosa.RESIZE.default, + args=(x, output_size, align_corners, scale_factors), + kwargs={"resize_mode": resize_mode}, + from_node=node, + ) + node.replace_all_uses_with(tosa_resize_node) + graph_module.graph.erase_node(node) + input_dtype = get_first_fake_tensor(x).dtype + if input_dtype == torch.int8 and resize_mode == "bilinear": + input_size = get_first_fake_tensor(x).shape + input_size_xy = input_size[2:] + output_size = get_first_fake_tensor(node).shape + output_size_xy = output_size[2:] + scale_n_yx, _, _, _ = get_resize_parameters( + input_size_xy=input_size_xy, + output_size_xy=output_size_xy, + resize_mode=1, + align_corners=align_corners, + ) + output_dtype = get_first_fake_tensor(node).dtype + output_scale = float(1 / 
(scale_n_yx[0] * scale_n_yx[1])) + with graph_module.graph.inserting_after(tosa_resize_node): + rescale_node = create_node( + graph_module.graph, + exir_ops.backend.tosa.RESCALE.default, + ) + tosa_resize_node.replace_all_uses_with(rescale_node) + rescale_node.args = ( + tosa_resize_node, + output_dtype, + output_scale, + 0, # zero point + 0, # zero point + ) + + if modified: + graph_module = super().call(graph_module).graph_module + return PassResult(graph_module, modified) diff --git a/backends/arm/_passes/scalars_to_attribute_pass.py b/backends/arm/_passes/scalars_to_attribute_pass.py index 89468bff1ff..9ad3e318011 100644 --- a/backends/arm/_passes/scalars_to_attribute_pass.py +++ b/backends/arm/_passes/scalars_to_attribute_pass.py @@ -6,10 +6,11 @@ # pyre-unsafe -from typing import cast, Union +from typing import cast, Set, Type, Union import torch from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor +from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass from executorch.exir.pass_base import ExportPass, PassResult from torch.fx import GraphModule, Node @@ -22,6 +23,8 @@ class ScalarsToAttributePass(ExportPass): to attribute Nodes that output the same value. """ + _passes_required_after: Set[Type[ExportPass]] = {MatchArgRanksPass} + targeted_ops = [ torch.ops.aten.add.Tensor, torch.ops.aten.add_.Tensor, diff --git a/backends/arm/_passes/size_adjust_input_pass.py b/backends/arm/_passes/size_adjust_input_pass.py index e87d65c450f..5eb77dc56df 100644 --- a/backends/arm/_passes/size_adjust_input_pass.py +++ b/backends/arm/_passes/size_adjust_input_pass.py @@ -5,7 +5,7 @@ # pyre-unsafe -from typing import cast, TypeAlias +from typing import cast, Set, Type, TypeAlias import torch.fx from executorch.backends.arm._passes.arm_pass_utils import create_node @@ -185,6 +185,8 @@ class SizeAdjustInputPass(ExportPass): input. 
""" + _passes_required_after: Set[Type[ExportPass]] = set() + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: graph = graph_module.graph modified_graph = False diff --git a/backends/arm/_passes/to_tosa_memory_format_pass.py b/backends/arm/_passes/to_tosa_memory_format_pass.py index e4436d638f4..b906c06b329 100644 --- a/backends/arm/_passes/to_tosa_memory_format_pass.py +++ b/backends/arm/_passes/to_tosa_memory_format_pass.py @@ -7,15 +7,29 @@ import logging +from typing import Set, Type import torch -from executorch.backends.arm._passes import AnnotateOutputDimOrderPass +from executorch.backends.arm._passes.annotate_decomposed_matmul import ( + AnnotateDecomposedMatmulPass, +) from executorch.backends.arm._passes.arm_pass_utils import ( create_node, get_first_fake_tensor, - get_output_dim_orders, is_param_node, ) +from executorch.backends.arm.constants import ( + HWCM_ORDER, + NCHW_ORDER, + NHWC_INVERSE_ORDER, + NHWC_ORDER, + NNCHW_ORDER, + NNHWC_INVERSE_ORDER, + NNHWC_ORDER, + NNNCHW_ORDER, + NNNHWC_INVERSE_ORDER, + NNNHWC_ORDER, +) from executorch.exir import ExportedProgram from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult @@ -38,11 +52,7 @@ class ToTosaMemoryFormatPass(ExportPass): The annotated tosa_dim_order is used to permute the node's shape such that it gives a TOSA-compliant shape. 
""" - NHWC_order = (0, 2, 3, 1) - NHWC_inverse_order = (0, 3, 1, 2) - HWCM_order = (2, 3, 0, 1) - NNHWC_order = (0, 1, 3, 4, 2) - NNHWC_inverse_order = (0, 1, 4, 2, 3) + _passes_required_after: Set[Type[ExportPass]] = set() def __init__(self, exported_program: ExportedProgram) -> None: self.exported_program = exported_program @@ -80,7 +90,11 @@ def is_weight_node_for_depthwise_conv2d(self, node: torch.fx.Node): @staticmethod def memory_format_differs(shape): """Returns true if the shape will have a different memory layout in (N)NCHW and (N)NHWC format""" - if len(shape) >= 5: + if len(shape) >= 6: + C = shape[3] + H = shape[4] + W = shape[5] + elif len(shape) == 5: C = shape[2] H = shape[3] W = shape[4] @@ -99,25 +113,26 @@ def memory_format_differs(shape): @staticmethod def is_channel_reshape(input_shape, output_shape): - """Returns true if the reshape changes the channel dimension""" - if not ( - (len(input_shape) == len(output_shape) and (len(output_shape) in (4, 5))) - or (len(input_shape) == 4 and len(output_shape) == 5) - or (len(input_shape) == 5 and len(output_shape) == 4) - ): + """Returns true if reshape changes the channel dimension or batch product dimension(s)""" + + valid_ranks = {4, 5, 6} + + if not (len(input_shape) in valid_ranks and len(output_shape) in valid_ranks): return False C_old = input_shape[-3] C_new = output_shape[-3] - N_new = ( - output_shape[0] - if len(output_shape) == 4 - else output_shape[0] * output_shape[1] - ) - N_old = ( - input_shape[0] if len(input_shape) == 4 else input_shape[0] * input_shape[1] - ) + def get_batch_prod_dim(shape): + product = 1 + + for dim in shape[:-3]: + product = product * dim + + return product + + N_old = get_batch_prod_dim(input_shape) + N_new = get_batch_prod_dim(output_shape) return (N_old != N_new) or (C_old != C_new) @@ -128,17 +143,27 @@ def insert_input_transpose(node, input_node, graph_module): node.replace_input_with(input_node, pre_permute_node) return + if 
len(get_first_fake_tensor(input_node).size()) == 6: + mem_format = NNNHWC_INVERSE_ORDER + elif len(get_first_fake_tensor(input_node).size()) == 5: + mem_format = NNHWC_INVERSE_ORDER + else: + mem_format = NHWC_INVERSE_ORDER + # Guard: mem_format must be a true permutation for the current rank + _rank_ = len( + get_first_fake_tensor(input_node).size() + ) # or (node) in output path + assert sorted(mem_format) == list( + range(_rank_) + ), f"bad perm {mem_format} for rank {_rank_} in insert_input_transpose" + with graph_module.graph.inserting_before(node): permute_node = create_node( graph_module.graph, exir_ops.backend.tosa.TRANSPOSE.default, args=( input_node, - list( - ToTosaMemoryFormatPass.NNHWC_inverse_order - if len(get_first_fake_tensor(input_node).size()) == 5 - else ToTosaMemoryFormatPass.NHWC_inverse_order - ), + list(mem_format), ), from_node=node, ) @@ -150,26 +175,38 @@ def insert_input_transpose(node, input_node, graph_module): @staticmethod def insert_output_transpose(node, graph_module): + + if len(get_first_fake_tensor(node).size()) == 6: + mem_format = NNNHWC_ORDER + elif len(get_first_fake_tensor(node).size()) == 5: + mem_format = NNHWC_ORDER + else: + mem_format = NHWC_ORDER + # Guard: mem_format must be a true permutation for the current rank + _rank_ = len(get_first_fake_tensor(node).size()) # or (node) in output path + assert sorted(mem_format) == list( + range(_rank_) + ), f"bad perm {mem_format} for rank {_rank_} in insert_input_transpose" + with graph_module.graph.inserting_after(node): permute_node = create_node( graph_module.graph, exir_ops.backend.tosa.TRANSPOSE.default, args=( node, - list( - ToTosaMemoryFormatPass.NNHWC_order - if len(get_first_fake_tensor(node).size()) == 5 - else ToTosaMemoryFormatPass.NHWC_order - ), + list(mem_format), ), from_node=node, ) - permute_node.meta["tosa_dim_order"] = ( - ToTosaMemoryFormatPass.NNHWC_order - if len(get_first_fake_tensor(node).size()) == 5 - else ToTosaMemoryFormatPass.NHWC_order - ) + 
rank = len(get_first_fake_tensor(node).size()) + if rank == 6: + permute_node.meta["tosa_dim_order"] = NNNHWC_ORDER + elif rank == 5: + permute_node.meta["tosa_dim_order"] = NNHWC_ORDER + else: + permute_node.meta["tosa_dim_order"] = NHWC_ORDER + node.meta["tosa_dim_order"] = tuple( range(len(get_first_fake_tensor(node).size())) ) @@ -218,7 +255,7 @@ def insert_tosa_transposes(self, graph_module: torch.fx.GraphModule): for node in graph_module.graph.nodes: # call_function and placeholder allowed due to # index.Tensor being able to come in as both - if node.op not in ["call_function", "placeholder", "output"]: + if node.op != "call_function": continue # Transpose views @@ -240,21 +277,34 @@ def insert_tosa_transposes(self, graph_module: torch.fx.GraphModule): graph_module, ) - # Transpose inputs - elif _is_input(node, self.exported_program): - input_shape = get_first_fake_tensor(node).size() - if len(input_shape) in (4, 5): - ToTosaMemoryFormatPass.insert_output_transpose(node, graph_module) + output_node = graph_module.graph.output_node() - # Transpose outputs - elif node.op == "output": - output_shape = get_first_fake_tensor(node).size() + # Transpose inputs if they are in (N)NCHW format + inputs = [ + n for n in graph_module.graph.nodes if _is_input(n, self.exported_program) + ] + for input_node in inputs: + input_dim_order = get_first_fake_tensor(input_node).dim_order() + if input_dim_order in (NCHW_ORDER, NNCHW_ORDER, NNNCHW_ORDER): + self.insert_output_transpose(input_node, graph_module) + + # Transpose outputs if they are in (N)NCHW format + outputs = output_node.args[0] + output_dim_orders = output_node.meta.get("original_dim_orders") + if output_dim_orders is None: + raise RuntimeError( + f"{AnnotateDecomposedMatmulPass.__name__} is required to run at the beginning of the pass pipeline when using {ToTosaMemoryFormatPass.__name__}." 
+ ) - if len(output_shape) in (4, 5): - for input_node in node.all_input_nodes: - ToTosaMemoryFormatPass.insert_input_transpose( - node, input_node, graph_module - ) + for output_node_input, output_dim_order in zip(outputs, output_dim_orders): # type: ignore[arg-type] + if output_dim_order in ( + NCHW_ORDER, + NNCHW_ORDER, + NNNCHW_ORDER, + ): + self.insert_input_transpose( + output_node, output_node_input, graph_module + ) def remove_dim_order_kwargs( self, graph_module: torch.fx.GraphModule, node: torch.fx.Node @@ -277,17 +327,19 @@ def call(self, graph_module: torch.fx.GraphModule): node_data = get_first_fake_tensor(node).data self.remove_dim_order_kwargs(graph_module, node) - # Inputs and outputs are always in (N)NCHW format + # Inputs and outputs may vary in dim_order if _is_input(node, self.exported_program) or node.op == "output": - dim_order = tuple(range(node_data.dim())) + dim_order = node_data.dim_order() elif node_data.dim() == 4: - dim_order = self.NHWC_order + dim_order = NHWC_ORDER if self.is_weight_node_for_depthwise_conv2d(node): # The weights of TOSA DEPTHWISE_CONV2D have shape (H, W, C, M) which corresponds to # dim_order = (2, 3, 0, 1) (https://www.mlplatform.org/tosa/tosa_spec.html#_depthwise_conv2d). - dim_order = self.HWCM_order + dim_order = HWCM_ORDER elif node_data.dim() == 5: - dim_order = self.NNHWC_order + dim_order = NNHWC_ORDER + elif node_data.dim() == 6: + dim_order = NNNHWC_ORDER else: dim_order = tuple(range(node_data.dim())) # type: ignore[assignment] @@ -300,32 +352,3 @@ def call(self, graph_module: torch.fx.GraphModule): graph_module = super().call(graph_module).graph_module return PassResult(graph_module, True) - - def requires(self, graph_module) -> None: - """ - This is the only pass which handles dim_orders, so verify that the output dim_orders has not changed since the beginning of the lowering pipeline. 
- """ - - dim_orders = get_output_dim_orders(graph_module) - original_dim_orders = graph_module.graph.output_node().meta.get( - "original_dim_orders" - ) - output_node = graph_module.graph.output_node() - - if original_dim_orders is None: - raise RuntimeError( - f"{AnnotateOutputDimOrderPass.__name__} must be run in the beginning of the pass pipeline to verify that the dim order has not changed unexpectedly during its run." - ) - - if len(dim_orders) != len(original_dim_orders): - raise RuntimeError( - f"The number of outputs has changed since {AnnotateOutputDimOrderPass.__name__} was run." - ) - - for node, dim_order, original_dim_order in zip( - output_node.args[0], dim_orders, original_dim_orders - ): - if dim_order != original_dim_order: - raise RuntimeError( - f"The dim order of output {node.name} has changed from {original_dim_order} to {dim_order} since {AnnotateOutputDimOrderPass.__name__} was run." - ) diff --git a/backends/arm/_passes/unsqueeze_before_repeat_pass.py b/backends/arm/_passes/unsqueeze_before_repeat_pass.py index 01983baa9ab..66286b6a954 100644 --- a/backends/arm/_passes/unsqueeze_before_repeat_pass.py +++ b/backends/arm/_passes/unsqueeze_before_repeat_pass.py @@ -1,9 +1,11 @@ -# Copyright 2024 Arm Limited and/or its affiliates. +# Copyright 2024-2025 Arm Limited and/or its affiliates. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
# pyre-unsafe +from typing import Set, Type + import torch import torch.fx from executorch.backends.arm._passes.arm_pass_utils import ( @@ -29,6 +31,8 @@ class UnsqueezeBeforeRepeatPass(ExportPass): repeat(multiples) """ + _passes_required_after: Set[Type[ExportPass]] = set() + def call(self, graph_module: torch.fx.GraphModule): modified_graph = False for node in graph_module.graph.nodes: diff --git a/backends/arm/_passes/unsqueeze_scalar_placeholders_pass.py b/backends/arm/_passes/unsqueeze_scalar_placeholders_pass.py index ccae9b503cf..d3932dd1217 100644 --- a/backends/arm/_passes/unsqueeze_scalar_placeholders_pass.py +++ b/backends/arm/_passes/unsqueeze_scalar_placeholders_pass.py @@ -5,6 +5,8 @@ # pyre-unsafe +from typing import Set, Type + import torch from executorch.exir.pass_base import ExportPass, PassResult from torch._export.utils import is_buffer, is_param @@ -16,6 +18,8 @@ class UnsqueezeScalarPlaceholdersPass(ExportPass): This pass unsqueezes the placeholders to make sure shape is at least (1,). 
""" + _passes_required_after: Set[Type[ExportPass]] = set() + def __init__(self, exported_program): self.exported_program = exported_program super().__init__() diff --git a/backends/arm/arm_vela.py b/backends/arm/arm_vela.py index 90f9dcb8324..5e2af9c5f39 100644 --- a/backends/arm/arm_vela.py +++ b/backends/arm/arm_vela.py @@ -34,7 +34,10 @@ def vela_bin_pack_io(prefix, data): io_elem_size = data[prefix + "_elem_size"][i] io_offset = data[prefix + "_offset"][i] io_region = data[prefix + "_region"][i] - assert len(io_shape) == vela_io_shape_dims + if len(io_shape) != vela_io_shape_dims: + raise ValueError( + f"Expected {vela_io_shape_dims}D shape, got {len(io_shape)}D" + ) inp_pad = io_shape.tolist() io_struct = struct.pack( " bytes: tosaname = "out.tosa" - tosa_path = os.path.join(tmpdir, tosaname) + tosa_path = os.path.join(dir, tosaname) with open(tosa_path, "wb") as f: f.write(tosa_flatbuffer) # invoke vela - output_dir = os.path.join(tmpdir, "output") + output_dir = os.path.join(dir, "output") args.append(f"--output-dir={output_dir}") args.append(tosa_path) if verbose: @@ -72,9 +80,9 @@ def vela_compile(tosa_flatbuffer: bytes, args: List[str], verbose: bool = False) if any("ethos-u85" in arg for arg in args) or any( "debug-force-regor" in arg for arg in args ): - np_path = os.path.join(tmpdir, "output", "out_vela.npz") + np_path = os.path.join(dir, "output", "out_vela.npz") else: - np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz") + np_path = os.path.join(dir, "output", "out_sg0_vela.npz") blocks = b"" with np.load(np_path, allow_pickle=False) as data: @@ -122,3 +130,9 @@ def vela_compile(tosa_flatbuffer: bytes, args: List[str], verbose: bool = False) blocks = blocks + block return blocks + + if intermediate_path is not None: + return run(intermediate_path) + else: + with tempfile.TemporaryDirectory() as tmpdir: + return run(tmpdir) diff --git a/backends/arm/common/arm_compile_spec.py b/backends/arm/common/arm_compile_spec.py index 
c6818e2716a..b38fe72b29c 100644 --- a/backends/arm/common/arm_compile_spec.py +++ b/backends/arm/common/arm_compile_spec.py @@ -126,7 +126,8 @@ def validate(self): def to_list(self): """Get the ArmCompileSpec in list form.""" - assert self.tosa_spec + if not self.tosa_spec: + raise ValueError("tosa_spec must be set before calling to_list()") # Always supply a TOSA version compile_spec = [ diff --git a/backends/arm/constants.py b/backends/arm/constants.py index fd8710d3ead..0e562f12e88 100644 --- a/backends/arm/constants.py +++ b/backends/arm/constants.py @@ -29,3 +29,18 @@ DEQUANT_PER_TENSOR_OP_T, ) PER_CHANNEL_QDQ_OPS: Final = (QUANT_PER_CHANNEL_OP, DEQUANT_PER_CHANNEL_OP) + +NHWC_ORDER: Final = (0, 2, 3, 1) +NHWC_INVERSE_ORDER: Final = (0, 3, 1, 2) +NNHWC_ORDER: Final = (0, 1, 3, 4, 2) +NNHWC_INVERSE_ORDER: Final = (0, 1, 4, 2, 3) +NNNHWC_ORDER: Final = (0, 1, 2, 4, 5, 3) +NNNHWC_INVERSE_ORDER: Final = (0, 1, 2, 5, 3, 4) + +NCHW_ORDER: Final = (0, 1, 2, 3) +NNCHW_ORDER: Final = (0, 1, 2, 3, 4) +NNNCHW_ORDER: Final = (0, 1, 2, 3, 4, 5) + +HWCM_ORDER: Final = (2, 3, 0, 1) + +MAX_RANK: Final = 6 diff --git a/backends/arm/ethosu/backend.py b/backends/arm/ethosu/backend.py index b7b8798c3e6..00da88ef60b 100644 --- a/backends/arm/ethosu/backend.py +++ b/backends/arm/ethosu/backend.py @@ -56,6 +56,7 @@ def _compile_tosa_flatbuffer( tosa_flatbuffer, compile_flags, verbose=logger.getEffectiveLevel() == logging.INFO, + intermediate_path=compile_spec.get_intermediate_path(), ) return binary diff --git a/backends/arm/operator_support/__init__.py b/backends/arm/operator_support/__init__.py index 7b73cddad37..53d37407ee6 100644 --- a/backends/arm/operator_support/__init__.py +++ b/backends/arm/operator_support/__init__.py @@ -16,8 +16,8 @@ pool_2d_support, reduce_sum_support, right_shift_support, - sin_cos_support, slice_copy_support, to_dim_order_copy_support, tosa_supported_operators, + where_support, ) diff --git a/backends/arm/operator_support/convolution_support.py 
b/backends/arm/operator_support/convolution_support.py index 6e9d3b3528e..f335c5046f5 100644 --- a/backends/arm/operator_support/convolution_support.py +++ b/backends/arm/operator_support/convolution_support.py @@ -2,6 +2,12 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +"""Declare operator support for ``aten.convolution`` in TOSA. + +Provide general checks and hardware-specific constraints (e.g., U55 subset) for +convolution nodes prior to delegation to the TOSA backend. + +""" from typing import cast @@ -18,6 +24,8 @@ @register_tosa_support_check class ConvolutionSupported(SupportedTOSAOperatorCheck): + """Provide TOSA support check for convolutions.""" + targets = [exir_ops.edge.aten.convolution.default] tosa_specs = [ @@ -25,8 +33,15 @@ class ConvolutionSupported(SupportedTOSAOperatorCheck): TosaSpecification.create_from_string("TOSA-1.0+FP"), ] - def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): + def is_node_tosa_supported( + self, node: fx.Node, tosa_spec: TosaSpecification + ) -> bool: + """Return True if the node is supported by TOSA. + Reject transposed convolutions and convolutions with non-zero output + padding. Apply additional hardware-specific constraints for U55. + + """ # Not implemented transposed = cast(bool, node.args[6]) output_padding = cast(list[int], node.args[7]) @@ -46,9 +61,19 @@ def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): else: return True - def _is_node_supported_u55(self, node: fx.Node): - """Hardware constraints for Ethos-U-55 case, Vela 4.2.0 (25.02 release)""" + def _is_node_supported_u55(self, node: fx.Node) -> bool: + """Enforce Ethos-U55-specific constraints (Vela 4.2.0). + + Check channel dimensions, kernel sizes, and stride/pad/dilation + combinations permitted on U55. + Args: + node (fx.Node): Convolution node to validate. 
+ + Returns: + bool: True if supported; otherwise, False. + + """ shape_in = cast(torch.Tensor, node.all_input_nodes[0].meta["val"]).shape shape_out = node.meta["val"].shape kernel = cast(fx.Node, node.args[1]).meta["val"].shape @@ -98,13 +123,17 @@ def _is_node_supported_u55(self, node: fx.Node): return True def _stride_condition(self, node: fx.Node) -> bool: - """This condition is somewhat complex but boils down - to not supporting stride > 3, unless we have some special conditions. - This condition is a simplified, relaxed version of the hardware constraint, - since the actual constraint requires information not available - here (without a lot of work). + """Check a simplified stride/padding/dilation constraint. + + Disallow strides greater than 3 unless there is no padding and the + dilation is 1. For 3D convolutions, enforce ``stride_z <= 1``. + + Args: + node (fx.Node): Convolution node to evaluate. + + Returns: + bool: True if the condition is satisfied. - This means that we might accept ops that are not actually supported. """ strides = cast(list[int], node.args[3]) has_padding = any(pad > 0 for pad in cast(list[int], node.args[4])) diff --git a/backends/arm/operator_support/embedding_support.py b/backends/arm/operator_support/embedding_support.py index bf95014e575..24395d56cbf 100644 --- a/backends/arm/operator_support/embedding_support.py +++ b/backends/arm/operator_support/embedding_support.py @@ -27,11 +27,16 @@ class EmbeddingSupported(SupportedTOSAOperatorCheck): def is_node_tosa_supported( self, node: fx.Node, tosa_spec: TosaSpecification ) -> bool: # type: ignore[override, misc] - # Note aten.embedding.default requires int64 indices and TOSA does not support it. - # Int32 indices here for aten.embedding.default is ok since it will be decomposed into ops that can handle it. 
- assert ( - len(node.all_input_nodes) == 2 - ), "Number of inputs to aten.embedding is not 2" + # Note aten.embedding.default requires int64 indices and TOSA does not + # support it. Int32 indices here for aten.embedding.default is ok since + # it will be decomposed into ops that can handle it. + + if len(node.all_input_nodes) != 2: + self.reporter.report_reject( + node, + (f"Expected exactly two input nodes, got {len(node.all_input_nodes)}"), + ) + return False indices_val = node.all_input_nodes[1].meta["val"] indices_dtype = indices_val.dtype diff --git a/backends/arm/operator_support/ethos_u55_support.py b/backends/arm/operator_support/ethos_u55_support.py index bf9e29d5cb7..27ddb95637b 100644 --- a/backends/arm/operator_support/ethos_u55_support.py +++ b/backends/arm/operator_support/ethos_u55_support.py @@ -2,6 +2,13 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +"""Provide Ethos-U55 specific operator support checks. + +Contains dtype validation, explicit unsupported-op filtering, and shape/ +permutation constraints for view and permute operations when targeting the +Ethos-U55 subset of TOSA. + +""" # pyre-unsafe @@ -21,6 +28,19 @@ def _try_determine_dtype(node: fx.Node) -> torch.dtype | None: + """Return an inferred dtype for a node when possible. + + Uses fake tensor metadata and nearby quantize/dequantize nodes to infer the + integer dtype used by the operator. Returns ``None`` when the dtype cannot + be determined reliably. + + Args: + node (fx.Node): FX node to inspect. + + Returns: + torch.dtype | None: Inferred dtype or ``None`` if unknown. + + """ dtype = get_first_fake_tensor(node).dtype if not dtype.is_floating_point: return dtype @@ -34,8 +54,23 @@ def _try_determine_dtype(node: fx.Node) -> torch.dtype | None: class EthosU55DtypeSupport(OperatorSupportBase): + """Validate dtypes for U55-supported operators. 
+ + Ensures operators use a supported integer dtype according to U55 + constraints, with specific rules for convolution, matmul, and table ops. + + Attributes: + reporter (WhyNoPartitionReporter): Reporter for rejection reasons. + + """ def __init__(self, reporter: WhyNoPartitionReporter): + """Initialize the check with a reporter. + + Args: + reporter (WhyNoPartitionReporter): Reporter for rejection reasons. + + """ super().__init__() self.reporter = reporter @@ -52,7 +87,20 @@ def __init__(self, reporter: WhyNoPartitionReporter): def is_node_supported( # noqa: C901 self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node ) -> bool: + """Return True if the node uses supported dtypes. + Applies per-operator dtype rules for U55, including specialized input + and weight constraints for convolution and int8-only checks for table + operations and matmul variants. + + Args: + submodules (typing.Mapping[str, torch.nn.Module]): Exported modules. + node (fx.Node): FX node to check. + + Returns: + bool: True if supported; otherwise, False. + + """ dtype = _try_determine_dtype(node) if dtype is None: # If we couldn't determine dtype, just return ok. @@ -112,10 +160,12 @@ def is_node_supported( # noqa: C901 class EthosU55NotSupported(OperatorSupportBase): - """ - Certain operators are not supported on U55. These are listed in `unsupported_ops`. - The comment mentions the unsupported TOSA operator that the aten operator maps to where it is not obvious. - For unimplemented operators, this is the anticipated mapping, and it might be incorrect. + """Reject operators not supported by Ethos-U55. + + The ``unsupported_ops`` list contains aten ops that either map to TOSA + operators the U55 cannot run or remain unimplemented. The mapping comments + capture expected TOSA equivalents when not obvious. 
+ """ unsupported_ops = [ @@ -128,7 +178,7 @@ class EthosU55NotSupported(OperatorSupportBase): exir_ops.edge.aten.bitwise_and.Scalar, exir_ops.edge.aten.bitwise_or.Scalar, exir_ops.edge.aten.bitwise_xor.Scalar, - exir_ops.edge.aten.bitwise_not, + exir_ops.edge.aten.bitwise_not.default, exir_ops.edge.aten.logical_and.default, exir_ops.edge.aten.logical_or.default, exir_ops.edge.aten.logical_xor.default, @@ -165,12 +215,27 @@ class EthosU55NotSupported(OperatorSupportBase): ] def __init__(self, reporter: WhyNoPartitionReporter): + """Initialize the check with a reporter. + + Args: + reporter (WhyNoPartitionReporter): Reporter for rejection reasons. + + """ self.reporter = reporter def is_node_supported( self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node ) -> bool: + """Return False for nodes explicitly unsupported on U55. + + Args: + submodules (typing.Mapping[str, torch.nn.Module]): Exported modules. + node (fx.Node): FX node to check. + Returns: + bool: False if ``node.target`` is in ``unsupported_ops``; else True. + + """ if node.target in self.unsupported_ops: self.reporter.report_reject(node, "Op is not supported on U55.") return False @@ -182,12 +247,37 @@ def is_node_supported( class EthosU55ViewCheck(OperatorSupportBase): + """Validate view/select shapes and dtypes for U55. + + Performs lightweight checks on output shape rank and product constraints, + with awareness that transposes may be inserted around view/select during + lowering to channels-last. + + Attributes: + reporter (WhyNoPartitionReporter): Reporter for rejection reasons. + + """ def __init__(self, reporter: WhyNoPartitionReporter): + """Initialize the check with a reporter. + + Args: + reporter (WhyNoPartitionReporter): Reporter for rejection reasons. + + """ super().__init__() self.reporter = reporter def axes_product(self, nhwc_shape: shape_t) -> int: + """Return the product of all axes in ``nhwc_shape``. + + Args: + nhwc_shape (list[int]): Shape in NHWC order. 
+ + Returns: + int: Product of the axis sizes. + + """ product = 1 for axes in nhwc_shape: product *= axes @@ -197,26 +287,27 @@ def axes_product(self, nhwc_shape: shape_t) -> int: def is_node_supported( self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node ) -> bool: - """ - Check whether a given view node is supported on U55. + """Check whether a given view/select node is U55-supported. Currently only checks dtypes and product of axes. - It is not the view operator itself that is not supported on U55. In order for the - view operator to be compatible with the channels-last format of TosaBackend, - transposes may need to be inserted before and after the view op. If that happens - and that transpose operator does not adhere to the limitations then it will - result in the following error: + It is not the view operator itself that is not supported on U55. In + order for the view operator to be compatible with the channels-last + format of TosaBackend, transposes may need to be inserted before and + after the view op. If that happens and that transpose operator does not + adhere to the limitations then it will result in the following error: CPU performance estimation for "Transpose" not implemented. ... CPU operations are not supported for GraphAPI input Args: - node: The FX node representing the view_copy operator. + submodules (typing.Mapping[str, torch.nn.Module]): Exported modules. + node (fx.Node): FX node for ``view_copy`` or ``select``. Returns: - False if the operator is not support and True if it is supported. + bool: False if rejected by constraints; otherwise, True. + """ # Select decomposes into squeeze, which in turn becomes a view. Therefore, # perform the same check on select operators as view operators. 
@@ -236,18 +327,20 @@ def is_node_supported( shape = input_node.meta["val"].shape rank = len(shape) if not -rank <= dim < rank: - raise IndexError( - f"Dim {dim} is outside of the range for tensor '{node.target}' of " - f"rank {rank}" + self.reporter.report_reject( + node, + (f"Dimension {dim} out of range for rank {rank}."), ) + return False dim = dim % rank size = shape[dim] if not -size <= index < size: - raise IndexError( - f"Index {index} is outside of the range for dim {dim} with size " - f"{size} for tensor {node.target}" + self.reporter.report_reject( + node, + (f"Index {index} out of range for dim {dim} with size {size}."), ) + return False index = index % size # Shape after squeeze. This may get converted into a view which may become @@ -277,14 +370,40 @@ def is_node_supported( class EthosU55TransposeCheck(OperatorSupportBase): + """Validate permute nodes against U55 reshape/transpose limits. + + Applies dtype- and rank-specific constraints to permutations. Tests both + NCHW and NHWC interpretations for rank-3/4 shapes since dim order is unknown + at partition time. + + Attributes: + reporter (WhyNoPartitionReporter): Reporter for rejection reasons. + + """ def __init__(self, reporter: WhyNoPartitionReporter): + """Initialize the check with a reporter. + + Args: + reporter (WhyNoPartitionReporter): Reporter for rejection reasons. + + """ super().__init__() self.reporter = reporter def _pad_to_rank_4( self, shape: shape_t, permutation: list[int] ) -> tuple[shape_t, shape_t]: + """Pad shape/permutation to rank 4 by prepending ones/indices. + + Args: + shape (list[int]): Original shape. + permutation (list[int]): Original permutation indices. + + Returns: + tuple[list[int], list[int]]: Padded shape and permutation. 
+ + """ diff = 4 - len(shape) padded_shape = [1] * diff + shape for i in range(len(permutation)): @@ -293,6 +412,15 @@ def _pad_to_rank_4( return padded_shape, padded_permutation def axes_product(self, nhwc_shape: shape_t) -> int: + """Return the product of all axes in ``nhwc_shape``. + + Args: + nhwc_shape (list[int]): Shape in NHWC order. + + Returns: + int: Product of the axis sizes. + + """ product = 1 for axes in nhwc_shape: product *= axes @@ -301,7 +429,7 @@ def axes_product(self, nhwc_shape: shape_t) -> int: def _permute_constraint_i8_i16( self, nhwc_shape: list[int], permutation: list[int] ) -> bool: - """Returns True if the constraints are ok.""" + """Return True if permutation meets i8/i16 constraints.""" N, H, W, C = nhwc_shape match permutation: case (0, 1, 2, 3): # NHWC -> NHWC @@ -314,7 +442,7 @@ def _permute_constraint_i8_i16( def _permute_constraint_i32( self, nhwc_shape: list[int], permutation: list[int] ) -> bool: - """Returns True if the constraints are ok.""" + """Return True if permutation meets i32 constraints.""" N, H, W, C = nhwc_shape match permutation: case (0, 1, 2, 3): # NHWC -> NHWC @@ -327,6 +455,7 @@ def _permute_constraint_i32( return False def _permute_constraint(self, shape, permutation, dtype): + """Return True if permutation meets dtype-specific constraints.""" if dtype in (torch.int8, torch.int16): return self._permute_constraint_i8_i16(shape, permutation) if dtype == torch.int32: @@ -336,7 +465,19 @@ def _permute_constraint(self, shape, permutation, dtype): def is_node_supported( self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node ) -> bool: + """Return True if a permute node satisfies U55 constraints. + + Tests both NCHW and NHWC interpretations for rank-3/4 shapes, and + applies dtype-specific limits to shapes and permutations. + + Args: + submodules (typing.Mapping[str, torch.nn.Module]): Exported modules. + node (fx.Node): FX node to check. + Returns: + bool: True if supported; otherwise, False. 
+ + """ if not node.target == exir_ops.edge.aten.permute_copy.default: return True @@ -382,3 +523,63 @@ def is_node_supported( return False return True + + +class EthosU55CastCheck(OperatorSupportBase): + """Reject unsupported casts on U55. + + U55 does not support casting from INT32 or any casts involving BOOL. Note that + casting from one dtype to the same dtype is a no-op and is supported. + + + Attributes: + reporter (WhyNoPartitionReporter): Reporter for rejection reasons. + + """ + + targets = [ + exir_ops.edge.dim_order_ops._to_dim_order_copy.default, + ] + + def __init__(self, reporter: WhyNoPartitionReporter): + """Initialize the check with a reporter. + + Args: + reporter (WhyNoPartitionReporter): Reporter for rejection reasons. + + """ + super().__init__() + self.reporter = reporter + + def is_node_supported( + self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node + ) -> bool: + """Return True if the node satisfies the cast constraints of U55. + + Args: + submodules (typing.Mapping[str, torch.nn.Module]): Exported modules. + node (fx.Node): FX node to check. + + Returns: + bool: True if supported; otherwise, False. + + """ + if node.target not in self.targets: + return True + input_dtype = get_first_fake_tensor(node.all_input_nodes[0]).dtype + output_dtype = get_first_fake_tensor(node).dtype + if input_dtype == output_dtype: + # This is ok as this will not result in a cast + return True + if input_dtype in (torch.bool, torch.int32): + self.reporter.report_reject( + node, f"Casting from {input_dtype} is not supported on U55." + ) + return False + if output_dtype in (torch.bool,): + self.reporter.report_reject( + node, f"Casting to {output_dtype} is not supported on U55." 
+ ) + return False + + return True diff --git a/backends/arm/operator_support/index_tensor_support.py b/backends/arm/operator_support/index_tensor_support.py index 4b226a9c407..92b0ce48a32 100644 --- a/backends/arm/operator_support/index_tensor_support.py +++ b/backends/arm/operator_support/index_tensor_support.py @@ -2,6 +2,12 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +"""Provide TOSA support checks for ``aten.index.Tensor``. + +Reject unsupported patterns such as high-rank index tensors, front-positioned +slice/ellipsis/None markers, and cases that exceed ``int32`` element limits. + +""" import math @@ -18,7 +24,8 @@ @register_tosa_support_check class IndexTensorSupported(SupportedTOSAOperatorCheck): - """ + """Prevent partitioning of unsupported ``index.Tensor`` usages. + This support check is intended to prevent the partitioning of currently unsupported usages of the index.Tensor operator. @@ -95,6 +102,7 @@ class IndexTensorSupported(SupportedTOSAOperatorCheck): t[1:3, torch.arange(5), 2:3, torch.arange(3).reshape(3,1)] are also possible and can result in some unintuitive behaviors where batching and indexing are mixed together. + """ targets = [exir_ops.edge.aten.index.Tensor] @@ -107,20 +115,43 @@ class IndexTensorSupported(SupportedTOSAOperatorCheck): def is_node_tosa_supported( self, node: fx.Node, tosa_spec: TosaSpecification ) -> bool: # type: ignore[override, misc] + """Return True if ``aten.index.Tensor`` usage fits supported patterns. + + Enforces the following constraints: + - No ``None`` (unsqueeze), slice, or ellipsis before an indexing tensor. + - Indexing tensors have rank <= 3. + - The value tensor element count fits in ``int32``. 
+ + """ indices = node.args[1] for index in indices: # type: ignore[union-attr] # Usage 2 guard if index is None: + self.reporter.report_reject( + node, + ( + "None (from slice/unsqueeze/ellipsis) before an indexing tensor" + " is not supported." + ), + ) return False # Usage 1 guard fake_tensor = get_first_fake_tensor(index) # type: ignore[arg-type] if len(fake_tensor.size()) > 3: + self.reporter.report_reject( + node, + ("Indexing tensors of rank >= 4 is not supported."), + ) return False # Usage 3 guard total_vals = math.prod(get_first_fake_tensor(node.args[0]).shape) # type: ignore[arg-type] if total_vals > torch.iinfo(torch.int32).max: + self.reporter.report_reject( + node, + ("Value size exceeds int32 range; would overflow flattened indexing."), + ) return False return True diff --git a/backends/arm/operator_support/minmax_support.py b/backends/arm/operator_support/minmax_support.py index edbf7f61818..68433819f4b 100644 --- a/backends/arm/operator_support/minmax_support.py +++ b/backends/arm/operator_support/minmax_support.py @@ -32,6 +32,13 @@ def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): ) if not (no_argmax or no_argmax_users): + self.reporter.report_reject( + node, + ( + "Using the indices output is not supported; only usage of the " + "values output is supported." + ), + ) return False return True diff --git a/backends/arm/operator_support/pool_2d_support.py b/backends/arm/operator_support/pool_2d_support.py index ff453741f1f..c0428e45e03 100644 --- a/backends/arm/operator_support/pool_2d_support.py +++ b/backends/arm/operator_support/pool_2d_support.py @@ -2,6 +2,12 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +"""Provide TOSA support checks for 2D pooling. + +Validate ``avg_pool2d`` and ``max_pool2d_with_indices`` against U55 profile +constraints including kernel size, stride, padding, and dimensionality. 
+ +""" from typing import cast @@ -20,16 +26,48 @@ def kernel_check(kernel: tuple[int, int]) -> bool: + """Check if kernel size is within U55 constraints. + + Checks that ``kernel_x * kernel_y`` is in ``[1, 65536]`` and + ``kernel_y`` is in ``[1, 256]`` as required by the U55 profile. + + Args: + kernel (tuple[int, int]): Kernel height and width ``(kh, kw)``. + + Returns: + bool: True if the kernel passes validation. + + """ if not (1 <= kernel[0] * kernel[1] <= 65536): return False return 1 <= kernel[1] <= 256 def stride_check(strides: tuple[int, int]) -> bool: + """Check if strides are within U55 constraints. + + Args: + strides (tuple[int, int]): Vertical and horizontal strides. + + Returns: + bool: True if each stride is in ``[1, 3]``. + + """ return all(1 <= stride <= 3 for stride in strides) def dim_check(shape=torch.Size) -> bool: + """Check if non-batch dims are within U55 constraints. + + Verifies that all dimensions except batch are in ``[1, 65536]``. + + Args: + shape (torch.Size): Input tensor shape. + + Returns: + bool: True if all checked dimensions pass. + + """ check = True for dim in shape[1:]: check &= 1 <= dim <= 65536 @@ -38,6 +76,13 @@ def dim_check(shape=torch.Size) -> bool: @register_tosa_support_check class AvgPool2dSupported(SupportedTOSAOperatorCheck): + """Provide TOSA support checks for ``aten.avg_pool2d``. + + Applies additional constraints when targeting the U55 subset, including + limits on kernel size, stride, padding behavior, and tensor ranks. + + """ + targets = [ exir_ops.edge.aten.avg_pool2d.default, ] @@ -48,6 +93,12 @@ class AvgPool2dSupported(SupportedTOSAOperatorCheck): ] def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): + """Return True if ``avg_pool2d`` satisfies U55 constraints. + + Computes the effective TOSA padding (depending on ``count_include_pad`` + and ``divisor_override``) and validates kernel, stride, and shape limits. 
+ + """ if not tosa_spec.is_U55_subset: return True @@ -115,6 +166,13 @@ def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): @register_tosa_support_check class MaxPool2dSupported(SupportedTOSAOperatorCheck): + """Provide TOSA support checks for ``aten.max_pool2d_with_indices``. + + Applies additional constraints when targeting the U55 subset, including + limits on kernel size, stride, and tensor ranks. + + """ + targets = [ exir_ops.edge.aten.max_pool2d_with_indices.default, ] @@ -125,6 +183,9 @@ class MaxPool2dSupported(SupportedTOSAOperatorCheck): ] def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): + """Return True if ``max_pool2d_with_indices`` satisfies U55 + constraints. + """ if not tosa_spec.is_U55_subset: return True diff --git a/backends/arm/operator_support/right_shift_support.py b/backends/arm/operator_support/right_shift_support.py index 5d3896e3643..df124319887 100644 --- a/backends/arm/operator_support/right_shift_support.py +++ b/backends/arm/operator_support/right_shift_support.py @@ -2,6 +2,12 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +"""Declare operator support for bitwise right-shift in TOSA. + +Provide support checks for ``aten.bitwise_right_shift`` and ``__rshift__`` +targets across integer and float TOSA profiles. 
+ +""" # pyre-unsafe @@ -21,6 +27,8 @@ @register_tosa_support_check class RightShiftSupported(SupportedTOSAOperatorCheck): + """Provide TOSA support check for right-shift operations.""" + targets = [ exir_ops.edge.aten.bitwise_right_shift.Tensor, exir_ops.edge.aten.__rshift__.Scalar, @@ -31,8 +39,15 @@ class RightShiftSupported(SupportedTOSAOperatorCheck): TosaSpecification.create_from_string("TOSA-1.0+FP"), ] - def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): + def is_node_tosa_supported( + self, node: fx.Node, tosa_spec: TosaSpecification + ) -> bool: + """Return True if the node is supported by TOSA. + + Emit a warning on U55 subsets where one-off errors may occur. Otherwise + accept all matching targets. + """ # TODO MLETORCH-525 Remove warning if tosa_spec.is_U55_subset: logging.warning(f"{node.target} may introduce one-off errors.") diff --git a/backends/arm/operator_support/sin_cos_support.py b/backends/arm/operator_support/sin_cos_support.py deleted file mode 100644 index dcdc20f8e4a..00000000000 --- a/backends/arm/operator_support/sin_cos_support.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright 2025 Arm Limited and/or its affiliates. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# pyre-unsafe - - -import torch.fx as fx -from executorch.backends.arm.operator_support.tosa_supported_operators import ( - register_tosa_support_check, - SupportedTOSAOperatorCheck, -) -from executorch.backends.arm.tosa import TosaSpecification -from executorch.exir.dialects._ops import ops as exir_ops - - -@register_tosa_support_check -class SinCosSupported(SupportedTOSAOperatorCheck): - targets = [ - exir_ops.edge.aten.cos.default, - exir_ops.edge.aten.sin.default, - ] - - tosa_specs = [ - TosaSpecification.create_from_string("TOSA-1.0+INT"), - TosaSpecification.create_from_string("TOSA-1.0+FP"), - ] - - def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification): - return True diff --git a/backends/arm/operator_support/to_dim_order_copy_support.py b/backends/arm/operator_support/to_dim_order_copy_support.py index e21f8a68ad6..3cc587d99d3 100644 --- a/backends/arm/operator_support/to_dim_order_copy_support.py +++ b/backends/arm/operator_support/to_dim_order_copy_support.py @@ -2,6 +2,13 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +"""Declare operator support for ``_to_dim_order_copy`` in TOSA. + +Provide dtype-compatibility checks for casting when converting to a specific +dimension order. Supported input/output dtype pairs depend on the active TOSA +profile (integer and/or float). + +""" # pyre-unsafe import copy @@ -25,6 +32,16 @@ @register_tosa_support_check class ToCopySupported(SupportedTOSAOperatorCheck): + """Provide TOSA support check for ``_to_dim_order_copy``. + + Attributes: + SUPPORTED_INT_PROFILE_DTYPES (dict[torch.dtype, list[torch.dtype]]): + Allowed output dtypes for each integer input dtype. + SUPPORTED_FP_PROFILE_DTYPES (dict[torch.dtype, list[torch.dtype]]): + Allowed output dtypes for each floating input dtype. 
+ + """ + targets = [ exir_ops.edge.dim_order_ops._to_dim_order_copy.default, ] @@ -40,21 +57,31 @@ def _merge_supported_types( dtypes1: SupportedTypeDict, dtypes2: SupportedTypeDict, ) -> SupportedTypeDict: + """Return a merged mapping of supported dtype transitions. + + Args: + dtypes1 (dict[torch.dtype, list[torch.dtype]]): Base mapping. + dtypes2 (dict[torch.dtype, list[torch.dtype]]): Mapping to merge in. + + Returns: + dict[torch.dtype, list[torch.dtype]]: Combined mapping. + + """ merged_dtypes = copy.deepcopy( dtypes1 - ) # Use deepcopy to avoid unintentionally modifying SUPPORTED_INT_TYPES + ) # Use deepcopy to avoid unintentionally modifying SUPPORTED_INT_PROFILE_DTYPES for k, v in dtypes2.items(): merged_dtypes[k] = merged_dtypes.get(k, []) + v return merged_dtypes - SUPPORTED_INT_TYPES: SupportedTypeDict = { + SUPPORTED_INT_PROFILE_DTYPES: SupportedTypeDict = { torch.bool: [torch.bool, torch.int8, torch.int16, torch.int32], torch.int8: [torch.bool, torch.int8, torch.int16, torch.int32], torch.int16: [torch.bool, torch.int8, torch.int16, torch.int32], torch.int32: [torch.bool, torch.int8, torch.int16, torch.int32], torch.int64: [torch.bool, torch.int8, torch.int16, torch.int32], } - SUPPORTED_FLOAT_TYPES: SupportedTypeDict = { + SUPPORTED_FP_PROFILE_DTYPES: SupportedTypeDict = { torch.int8: [torch.int8, torch.float16, torch.bfloat16, torch.float32], torch.int16: [torch.int16, torch.float16, torch.bfloat16, torch.float32], torch.int32: [torch.int32, torch.float16, torch.bfloat16, torch.float32], @@ -89,24 +116,28 @@ def _merge_supported_types( torch.int32, torch.bfloat16, torch.float16, + torch.float32, ], } - ALL_SUPPORTED_TYPES = _merge_supported_types( - SUPPORTED_INT_TYPES, SUPPORTED_FLOAT_TYPES - ) def is_node_tosa_supported( self, node: fx.Node, tosa_spec: TosaSpecification ) -> bool: + """Return True if the node is supported by TOSA. 
+ + Check FakeTensor metadata, validate input dtype is supported for the + active profile, and ensure the output dtype is allowed for the given + input dtype. + """ supported_dtypes: SupportedTypeDict = {} if tosa_spec.support_integer(): supported_dtypes = self._merge_supported_types( - self.SUPPORTED_INT_TYPES, supported_dtypes + self.SUPPORTED_INT_PROFILE_DTYPES, supported_dtypes ) if tosa_spec.support_float(): supported_dtypes = self._merge_supported_types( - self.SUPPORTED_FLOAT_TYPES, supported_dtypes + self.SUPPORTED_FP_PROFILE_DTYPES, supported_dtypes ) if len(node.all_input_nodes) != 1: diff --git a/backends/arm/operator_support/tosa_profile_supported_op_lists.py b/backends/arm/operator_support/tosa_profile_supported_op_lists.py index d3207c65dff..86db2d9b0b6 100644 --- a/backends/arm/operator_support/tosa_profile_supported_op_lists.py +++ b/backends/arm/operator_support/tosa_profile_supported_op_lists.py @@ -2,6 +2,12 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +"""Define TOSA profile support lists for INT and FP. + +Expose static sets of EXIR operator overloads used by the TOSA partitioner to +seed positive support checks for different profiles. + +""" import operator from typing import Final, Set @@ -12,6 +18,7 @@ # INT profile: ops supported via native TOSA ops, decompositions/transformations, precompute, TableOps, etc. +# Note that ops supported via pre-quantization decompositions are not included here. 
TOSA_PRO_INT_SupportList: Final[Set] = { exir_ops.edge.aten.abs.default, exir_ops.edge.aten.add.Tensor, @@ -24,6 +31,7 @@ exir_ops.edge.aten.bitwise_and.Scalar, exir_ops.edge.aten.bitwise_or.Scalar, exir_ops.edge.aten.bitwise_xor.Scalar, + exir_ops.edge.aten.cos.default, exir_ops.edge.aten.logical_and.default, exir_ops.edge.aten.logical_or.default, exir_ops.edge.aten.logical_xor.default, @@ -39,8 +47,6 @@ exir_ops.edge.aten.hardsigmoid.default, exir_ops.edge.aten.hardtanh.default, exir_ops.edge.aten.hardswish.default, - exir_ops.edge.aten.div.Tensor, - exir_ops.edge.aten.div.Tensor_mode, exir_ops.edge.aten.eq.Tensor, exir_ops.edge.aten.eq.Scalar, exir_ops.edge.aten.erf.default, @@ -61,16 +67,7 @@ exir_ops.edge.aten.lt.Tensor, exir_ops.edge.aten.lt.Scalar, exir_ops.edge.aten.mul.Tensor, - exir_ops.edge.aten.ne.Tensor, - exir_ops.edge.aten.ne.Scalar, exir_ops.edge.aten.neg.default, - exir_ops.edge.aten.add.Scalar, - exir_ops.edge.aten.sub.Scalar, - exir_ops.edge.aten.mul.Scalar, - exir_ops.edge.aten.div.Scalar, - exir_ops.edge.aten._native_batch_norm_legit_no_training.default, - exir_ops.edge.aten.native_layer_norm.default, - exir_ops.edge.aten.native_group_norm.default, exir_ops.edge.aten.sigmoid.default, exir_ops.edge.aten.mean.dim, exir_ops.edge.aten.mm.default, @@ -79,25 +76,17 @@ exir_ops.edge.aten.repeat.default, exir_ops.edge.aten.reciprocal.default, exir_ops.edge.aten.relu.default, - exir_ops.edge.aten.leaky_relu.default, - exir_ops.edge.aten.sqrt.default, exir_ops.edge.aten.rsqrt.default, - exir_ops.edge.aten.round.default, - exir_ops.edge.aten._softmax.default, exir_ops.edge.aten.select_copy.int, - exir_ops.edge.aten._log_softmax.default, exir_ops.edge.aten.sub.Tensor, exir_ops.edge.aten.tanh.default, exir_ops.edge.aten.upsample_bilinear2d.vec, exir_ops.edge.aten.upsample_nearest2d.vec, - exir_ops.edge.aten.var.correction, - exir_ops.edge.aten.var.dim, exir_ops.edge.aten.view_copy.default, exir_ops.edge.aten.unsqueeze_copy.default, 
exir_ops.edge.aten.squeeze_copy.dims, exir_ops.edge.aten.pow.Tensor_Scalar, exir_ops.edge.aten.pow.Tensor_Tensor, - exir_ops.edge.aten.where.self, operator.getitem, exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, exir_ops.edge.quantized_decomposed.quantize_per_channel.default, @@ -113,6 +102,7 @@ torch.ops.aten.scalar_tensor.default, exir_ops.edge.aten.gelu.default, exir_ops.edge.aten.alias_copy.default, + exir_ops.edge.aten.sin.default, exir_ops.edge.aten.sinh.default, exir_ops.edge.aten.atan.default, exir_ops.edge.aten.acosh.default, @@ -120,14 +110,12 @@ exir_ops.edge.aten.sign.default, exir_ops.edge.aten.asin.default, exir_ops.edge.aten.atanh.default, - exir_ops.edge.aten.addmm.default, exir_ops.edge.aten.masked_fill.Scalar, exir_ops.edge.aten.asinh.default, exir_ops.edge.aten.cosh.default, - exir_ops.edge.aten.glu.default, - exir_ops.edge.aten.logit.default, exir_ops.edge.aten.acos.default, exir_ops.edge.aten.elu.default, + exir_ops.edge.aten.bitwise_not.default, } @@ -147,6 +135,7 @@ exir_ops.edge.aten.cat.default, exir_ops.edge.aten.ceil.default, exir_ops.edge.aten.clamp.default, + exir_ops.edge.aten.cos.default, exir_ops.edge.aten.cumsum.default, exir_ops.edge.aten.bmm.default, exir_ops.edge.aten.permute_copy.default, @@ -211,7 +200,6 @@ exir_ops.edge.aten.squeeze_copy.dims, exir_ops.edge.aten.pow.Tensor_Scalar, exir_ops.edge.aten.pow.Tensor_Tensor, - exir_ops.edge.aten.where.self, operator.getitem, exir_ops.edge.aten.constant_pad_nd.default, exir_ops.edge.aten.amax.default, @@ -223,6 +211,7 @@ torch.ops.aten.scalar_tensor.default, exir_ops.edge.aten.gelu.default, exir_ops.edge.aten.alias_copy.default, + exir_ops.edge.aten.sin.default, exir_ops.edge.aten.sinh.default, exir_ops.edge.aten.atan.default, exir_ops.edge.aten.acosh.default, diff --git a/backends/arm/operator_support/tosa_supported_operators.py b/backends/arm/operator_support/tosa_supported_operators.py index b580fbb9a9a..f7857894d40 100644 --- 
a/backends/arm/operator_support/tosa_supported_operators.py +++ b/backends/arm/operator_support/tosa_supported_operators.py @@ -19,8 +19,9 @@ FuseQuantizedActivationPass, ) from executorch.backends.arm._passes.insert_table_ops import TableOps -from executorch.backends.arm.constants import DQ_OPS, Q_OPS +from executorch.backends.arm.constants import DQ_OPS, MAX_RANK, Q_OPS from executorch.backends.arm.operator_support.ethos_u55_support import ( + EthosU55CastCheck, EthosU55DtypeSupport, EthosU55NotSupported, EthosU55TransposeCheck, @@ -126,7 +127,7 @@ def tosa_support_factory( negative_checks: list[OperatorSupportBase] = [ CheckInt64InputsAndOutputs(exported_program, reporter), CheckFloat64Inputs(exported_program, reporter), - RankCheck(reporter, max_rank=5), + RankCheck(reporter, max_rank=MAX_RANK), *[ reporter.wrap_check(check, f"Rejected by {check.__class__.__name__}") for check in (additional_checks if additional_checks else []) @@ -134,13 +135,13 @@ def tosa_support_factory( ] if not tosa_spec.support_float(): - negative_checks.append(NeedsDecompositionCheck(reporter)) negative_checks.append(CheckProperQuantization(reporter)) if tosa_spec.is_U55_subset: negative_checks.append(EthosU55NotSupported(reporter)) negative_checks.append(EthosU55DtypeSupport(reporter)) negative_checks.append(EthosU55TransposeCheck(reporter)) negative_checks.append(EthosU55ViewCheck(reporter)) + negative_checks.append(EthosU55CastCheck(reporter)) return chain( reporter.wrap_check( @@ -154,7 +155,8 @@ def tosa_support_factory( class TOSAProINTSupportList(OperatorSupportBase): """ TOSA_PRO_INT_SupportList: - Ops supported in INT profile via native TOSA ops, decomposition/transformation, pre-compute, or TableOps + Ops supported in INT profile via native TOSA ops, decomposition/transformation, pre-compute, or TableOps. + Note that ops supported via pre-quantization decompositions are not included here. 
""" def is_node_supported( @@ -177,57 +179,6 @@ def is_node_supported( return node.op == "call_function" and node.target in TOSA_PRO_FP_SupportList -class NeedsDecompositionCheck(OperatorSupportBase): - """ - Targeted operators need to be decomposed prior to quantization in order to get a pair of q-dq-nodes surrounding - the operator, and to get optimal quantization parameters for each operator. This check will reject operators - that need to be decomposed. - """ - - def __init__(self, reporter: WhyNoPartitionReporter): - self.reporter = reporter - - def is_node_supported( - self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node - ) -> bool: - - if node.op != "call_function": - return True - - needs_decomp_dict = { - exir_ops.edge.aten.div.Tensor: None, - exir_ops.edge.aten._native_batch_norm_legit_no_training.default: "BatchNorm2D with track_running_stats==True not immediately following a convolution is not supported for quantized TOSA backends.", - exir_ops.edge.aten.native_layer_norm.default: None, - exir_ops.edge.aten.native_group_norm.default: None, - exir_ops.edge.aten._softmax.default: None, - exir_ops.edge.aten._log_softmax.default: None, - exir_ops.edge.aten.var.correction: None, - exir_ops.edge.aten.var.dim: None, - exir_ops.edge.aten.add.Scalar: None, - exir_ops.edge.aten.sqrt.default: None, - exir_ops.edge.aten.sub.Scalar: None, - exir_ops.edge.aten.mul.Scalar: None, - exir_ops.edge.aten.ne.Tensor: None, - exir_ops.edge.aten.ne.Scalar: None, - exir_ops.edge.aten.div.Scalar: None, - exir_ops.edge.aten.leaky_relu.default: None, - exir_ops.edge.aten.round.default: None, - exir_ops.edge.aten.addmm.default: None, - exir_ops.edge.aten.glu.default: None, - exir_ops.edge.aten.logit.default: None, - } - - if node.target in needs_decomp_dict: - reject_message = needs_decomp_dict[node.target] - if reject_message is None: - reject_message = "Op needs to be decomposed into other ops before quantization to get quantized properly." 
- - self.reporter.report_reject(node, reject_message) - return False - else: - return True - - class CheckProperQuantization(OperatorSupportBase): """ For targeted nodes, check that it has been quantized as expected. In most cases this means that a pair of quantize diff --git a/backends/arm/operator_support/where_support.py b/backends/arm/operator_support/where_support.py new file mode 100644 index 00000000000..2ec7c30827d --- /dev/null +++ b/backends/arm/operator_support/where_support.py @@ -0,0 +1,77 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +import torch + +import torch.fx as fx +from executorch.backends.arm.constants import DQ_OPS +from executorch.backends.arm.operator_support.tosa_supported_operators import ( + register_tosa_support_check, + SupportedTOSAOperatorCheck, +) +from executorch.backends.arm.tosa import TosaSpecification +from executorch.exir.dialects._ops import ops as exir_ops + + +@register_tosa_support_check +class WhereSupported(SupportedTOSAOperatorCheck): + targets = [exir_ops.edge.aten.where.self] + + tosa_specs = [ + TosaSpecification.create_from_string("TOSA-1.0+INT"), + TosaSpecification.create_from_string("TOSA-1.0+FP"), + ] + + def is_node_tosa_supported( + self, node: fx.Node, tosa_spec: TosaSpecification + ) -> bool: # type: ignore[override, misc] + + if len(node.all_input_nodes) != 3: + self.reporter.report_reject( + node, + ( + "Expected exactly three input nodes, " + f"got {len(node.all_input_nodes)} for {node.target}." 
+ ), + ) + return False + + condition, x, y = node.all_input_nodes + if condition.meta["val"].dtype != torch.bool: + self.reporter.report_reject( + node, + f"Type of condition in {node.target} is not torch.bool", + ) + return False + + x_dtype, y_dtype = x.meta["val"].dtype, y.meta["val"].dtype + if tosa_spec.support_float(): + if x_dtype in (torch.bool, torch.float16, torch.float32) and y_dtype in ( + torch.bool, + torch.float16, + torch.float32, + ): + return True + + if tosa_spec.support_integer(): + if ( + x_dtype in (torch.bool, torch.int8, torch.int16, torch.int32) + or (x_dtype == torch.float32 and x.target in DQ_OPS) + ) and ( + y_dtype in (torch.bool, torch.int8, torch.int16, torch.int32) + or (y_dtype == torch.float32 and y.target in DQ_OPS) + ): + return True + + self.reporter.report_reject( + node, + ( + f"Tensor x dtype {x_dtype} and/or tensor y dtype {y_dtype} is not supported in {node.target} " + f"for tosa specification {tosa_spec}" + ), + ) + + return False diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py index f7a9638254e..9278d25959f 100644 --- a/backends/arm/operators/__init__.py +++ b/backends/arm/operators/__init__.py @@ -13,7 +13,7 @@ op_amin, op_any, op_avg_pool2d, - op_bmm, + op_bitwise_not, op_cat, op_ceil, op_clamp, @@ -41,7 +41,6 @@ op_pow, op_reciprocal, op_repeat, - op_rescale, op_rshift_tensor, op_rsqrt, op_sigmoid, @@ -49,12 +48,13 @@ op_slice, op_sub, op_sum, - op_table, op_tanh, op_to_dim_order_copy, - op_transpose, - op_upsample_bilinear2d, - op_upsample_nearest2d, + op_tosa_matmul, + op_tosa_rescale, + op_tosa_resize, + op_tosa_table, + op_tosa_transpose, op_view, op_where, ops_binary, diff --git a/backends/arm/operators/op_abs.py b/backends/arm/operators/op_abs.py index ec76eb5517f..943c4778867 100644 --- a/backends/arm/operators/op_abs.py +++ b/backends/arm/operators/op_abs.py @@ -6,9 +6,6 @@ # pyre-unsafe from typing import Any, List -import executorch.backends.arm.tosa.quant_utils as 
tqutils -import executorch.backends.arm.tosa.utils as tutils - from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, @@ -18,22 +15,20 @@ validate_same_dtype, validate_valid_dtype, ) -from executorch.backends.arm.tosa import TosaSpecification from executorch.backends.arm.tosa.mapping import TosaArg +from executorch.backends.arm.tosa.specification import TosaSpecification from torch.fx import Node @register_node_visitor -class AbsVisitor_INT(NodeVisitor): +class AbsVisitor(NodeVisitor): target = "aten.abs.default" tosa_specs = [ TosaSpecification.create_from_string("TOSA-1.0+INT"), + TosaSpecification.create_from_string("TOSA-1.0+FP"), ] - def __init__(self, *args): - super().__init__(*args) - def define_node( self, node: Node, @@ -47,89 +42,18 @@ def define_node( validate_num_inputs(self.target, inputs, 1) validate_same_dtype(self.target, [*inputs, output], ts) - # Handle int8 (quantized) and int32 validate_valid_dtype( self.target, [*inputs, output], - [ts.DType.INT8, ts.DType.INT32], + [ts.DType.INT32, ts.DType.FP32], output.tosa_spec, ) - scale_back = 1.0 - if inputs[0].dtype == ts.DType.INT8: - rescaled_inputs, scale_back = tqutils.insert_rescale_ops_to_int32( - tosa_graph, inputs, node, self.tosa_spec - ) # type: ignore[possibly-undefined] - else: - # input[0].dtype == ts.DType.INT32 - # Non quantized input, natively support by TOSA.abs - rescaled_inputs = inputs - - if output.dtype == ts.DType.INT8: - broadcasted_shape = tutils.tosa_shape(output.shape, output.dim_order) - abs_output = tosa_graph.addIntermediate(broadcasted_shape, ts.DType.INT32) - else: - # output.dtype == ts.DType.INT32 - abs_output = output - - # Do the INT32 Abs - self._serialize_operator( - node, - tosa_graph, + tosa_graph.addOperator( ts.TosaOp.Op().ABS, [ - rescaled_inputs[0].name, + inputs[0].name, ], - [abs_output.name], + [output.name], None, ) - - if output.dtype == ts.DType.INT8: - # Scale output back to 8 bit - # pyre-ignore - 
tqutils.insert_rescale_op_to_int8( - tosa_graph, abs_output, scale_back, node, self.tosa_spec - ) # type: ignore[possibly-undefined] - - -@register_node_visitor -class AbsVisitor_FP(AbsVisitor_INT): - # inheriting 'target' from BI class - - tosa_specs = [TosaSpecification.create_from_string("TOSA-1.0+FP")] - - def __init__(self, *args): - super().__init__(*args) - - def define_node( - self, - node: Node, - tosa_graph: Any, - inputs: List[TosaArg], - output: TosaArg, - ) -> None: - - import serializer.tosa_serializer as ts # type: ignore - - validate_num_inputs(self.target, inputs, 1) - validate_same_dtype(self.target, [*inputs, output], ts) - - if inputs[0].dtype in [ts.DType.INT8, ts.DType.INT32]: - # Call the inherited define_node for handling integers - super().define_node(node, tosa_graph, inputs, output) - else: - # FP32 Abs lowering - - validate_valid_dtype( - self.target, [*inputs, output], ts.DType.FP32, output.tosa_spec - ) - - # MI lowering - self._serialize_operator( - node, - tosa_graph, - ts.TosaOp.Op().ABS, - [inputs[0].name], - [output.name], - None, - ) diff --git a/backends/arm/operators/op_add.py b/backends/arm/operators/op_add.py index a8f0c3fe14d..81b415363ea 100644 --- a/backends/arm/operators/op_add.py +++ b/backends/arm/operators/op_add.py @@ -64,12 +64,18 @@ def define_node( rescaled_inputs, scale_back = tqutils.insert_rescale_ops_to_int32_maxscale( tosa_graph, inputs, node, self.tosa_spec ) + elif inputs[0].dtype == ts.DType.INT16: + rescaled_inputs, scale_back = ( + tqutils.insert_rescale_ops_int16_to_int32_maxscale( + tosa_graph, inputs, node, self.tosa_spec + ) + ) else: # input[0].dtype == ts.DType.INT16 or ts.DType.INT32 # Non quantized input, natively support by TOSA.ADD rescaled_inputs = inputs - if output.dtype == ts.DType.INT8: + if output.dtype in [ts.DType.INT8, ts.DType.INT16]: broadcasted_shape = tutils.tosa_shape(output.shape, output.dim_order) add_output = tosa_graph.addIntermediate(broadcasted_shape, ts.DType.INT32) else: @@ 
-99,6 +105,15 @@ def define_node( compute_rescale=False, tosa_spec=self.tosa_spec, ) # type: ignore[possibly-undefined] + elif output.dtype == ts.DType.INT16: + tqutils.insert_rescale_op_to_int16( + tosa_graph, + add_output, + scale_back, + node, + compute_rescale=False, + tosa_spec=self.tosa_spec, + ) # type: ignore[possibly-undefined] @register_node_visitor diff --git a/backends/arm/operators/op_bitwise_not.py b/backends/arm/operators/op_bitwise_not.py new file mode 100644 index 00000000000..908cf68e9b2 --- /dev/null +++ b/backends/arm/operators/op_bitwise_not.py @@ -0,0 +1,59 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Any, List + +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.operators.operator_validation_utils import ( + validate_num_inputs, + validate_same_dtype, + validate_valid_dtype, +) +from executorch.backends.arm.tosa.mapping import TosaArg +from executorch.backends.arm.tosa.specification import TosaSpecification +from torch.fx import Node + + +@register_node_visitor +class BitwiseNotVisitor(NodeVisitor): + target = "aten.bitwise_not.default" + + # bitwise_not is not supported on the FP profile + tosa_specs = [ + TosaSpecification.create_from_string("TOSA-1.0+INT"), + ] + + def __init__(self, *args): + super().__init__(*args) + + def define_node( + self, + node: Node, + tosa_graph: Any, + inputs: List[TosaArg], + output: TosaArg, + ) -> None: + + import serializer.tosa_serializer as ts # type: ignore + + validate_num_inputs(self.target, inputs, 1) + validate_same_dtype(self.target, [*inputs, output], ts) + validate_valid_dtype( + self.target, + [*inputs, output], + [ts.DType.INT8, ts.DType.INT16, ts.DType.INT32], + output.tosa_spec, + ) + + self._serialize_operator( + node, + tosa_graph, + 
ts.TosaOp.Op().BITWISE_NOT, + [inputs[0].name], + [output.name], + ) diff --git a/backends/arm/operators/op_conv2d.py b/backends/arm/operators/op_conv2d.py index 6bfe0ab21eb..933e353387b 100644 --- a/backends/arm/operators/op_conv2d.py +++ b/backends/arm/operators/op_conv2d.py @@ -4,6 +4,8 @@ # LICENSE file in the root directory of this source tree. # pyre-unsafe +"""Provide a visitor for lowering 2D convolution to TOSA (INT/FP).""" + import itertools from typing import Any, List @@ -19,15 +21,22 @@ ) from executorch.backends.arm.operators.operator_validation_utils import ( validate_num_inputs, + validate_valid_dtype, ) -from executorch.backends.arm.tosa import TosaSpecification from executorch.backends.arm.tosa.mapping import TosaArg from executorch.backends.arm.tosa.quant_utils import build_rescale +from executorch.backends.arm.tosa.specification import Tosa_1_00, TosaSpecification from executorch.backends.arm.tosa.utils import tosa_shape @register_node_visitor class Conv2dVisitor(NodeVisitor): + """Provide a visitor that lowers ``aten.convolution`` to TOSA. + + Map to ``CONV2D`` or ``DEPTHWISE_CONV2D`` as appropriate. + + """ + target = "aten.convolution.default" tosa_specs = [ @@ -38,13 +47,32 @@ class Conv2dVisitor(NodeVisitor): def __init__(self, *args): super().__init__(*args) - # torch.nn.Conv2d does not require the result of - # `(input + 2 * pad - dilation * (weight - 1) - 1) / stride` - # to be an integer, but tosa currently strictly require this property. - # This function adjusts the pad value to meet the requirement. def adjust_pad_if_needed( self, input_size: int, input_weight: int, stride: int, pad: int, dilation: int ) -> int: + """Adjust padding to satisfy TOSA's integer output-size requirement. + + Torch ``Conv2d`` does not require the result of + ``(input + 2 * pad - dilation * (weight - 1) - 1) / stride`` to be an + integer, but TOSA does. This helper reduces the provided padding so + that the expression becomes divisible by ``stride``. 
+ + Args: + input_size (int): Spatial input size along the dimension (H or W). + input_weight (int): Kernel size along the same dimension. + stride (int): Stride along the same dimension. + pad (int): Padding value to adjust (bottom or right after duplication). + dilation (int): Dilation along the same dimension. + + Returns: + int: Adjusted padding value that yields an integer output size. + + Raises: + RuntimeError: If the required adjustment exceeds the provided + padding, which should be handled by the ``SizeAdjustInputPass`` + pass instead. + + """ mod_remainder = ( input_size + 2 * pad - dilation * (input_weight - 1) - 1 ) % stride @@ -55,7 +83,8 @@ def adjust_pad_if_needed( if mod_remainder > pad: raise RuntimeError( - "This case should be handled by the SizeAdjustConv2d pass, is it enabled?" + "This case should be handled by the SizeAdjustInputPass pass, " + "is it enabled?" ) return pad - mod_remainder @@ -66,13 +95,39 @@ def define_node( inputs: List[TosaArg], output: TosaArg, ) -> None: - + """Define the TOSA CONV2D/DEPTHWISE_CONV2D operator and post-rescale.""" import serializer.tosa_serializer as ts # type: ignore from tosa.RoundingMode import RoundingMode # type: ignore input, weight, bias, stride, pad, dilation, _, _, group = inputs validate_num_inputs(self.target, inputs, 9) + valid_input_dtypes = [] + if self.tosa_spec.support_float(): + valid_input_dtypes.append(ts.DType.FP32) + if self.tosa_spec.support_integer(): + valid_input_dtypes.append(ts.DType.INT8) + + if isinstance(self.tosa_spec, Tosa_1_00) and self.tosa_spec.support_extension( + "int16" + ): + valid_input_dtypes.append(ts.DType.INT16) + # Check constraints for int16 activations + if inputs[0].dtype == ts.DType.INT16: + validate_valid_dtype( + self.target, [inputs[1]], [ts.DType.INT8], self.tosa_spec + ) + validate_valid_dtype( + self.target, [inputs[2]], [ts.DType.INT48], self.tosa_spec + ) + + validate_valid_dtype( + self.target, + [inputs[0]], + valid_input_dtypes, + self.tosa_spec, 
+ ) + # Get the attributes of convolution. attr = ts.TosaSerializerAttribute() pad_attr = [val for val in pad.special for _ in (0, 1)] @@ -97,8 +152,8 @@ def define_node( ) input_zp = 0 - if inputs[0].dtype == ts.DType.INT8: - # int8 input requires quantization information + if inputs[0].dtype in (ts.DType.INT8, ts.DType.INT16): + # int8 and int16 input requires quantization information input_qparams = get_input_qparams(node) input_zp = input_qparams[0].get_zp_per_tensor() @@ -109,22 +164,29 @@ def define_node( weight_zp = input_qparams[1].zp # type: ignore[assignment] # The output type is int32 when input type is int8. - conv2d_output_name = output.name - if output.dtype == ts.DType.INT8: + if inputs[0].dtype == ts.DType.INT8: conv2d_res = tosa_graph.addIntermediate( tosa_shape(output.shape, output.dim_order), ts.DType.INT32 ) conv2d_output_name = conv2d_res.name - acc_type = ( - inputs[0].dtype if inputs[0].dtype == ts.DType.FP32 else ts.DType.INT32 - ) + acc_type = ts.DType.INT32 + elif inputs[0].dtype == ts.DType.INT16: + conv2d_res = tosa_graph.addIntermediate( + tosa_shape(output.shape, output.dim_order), ts.DType.INT48 + ) + conv2d_output_name = conv2d_res.name + acc_type = ts.DType.INT48 + else: + conv2d_output_name = output.name + conv2d_res = output + acc_type = ts.DType.FP32 tosa_graph.addConst( - [1], output.dtype, [input_zp], name=f"{conv2d_output_name}_input_zp" + [1], inputs[0].dtype, [input_zp], name=f"{conv2d_output_name}_input_zp" ) tosa_graph.addConst( [1], - output.dtype, + inputs[1].dtype, weight_zp, name=f"{conv2d_output_name}_weight_zp", ) @@ -133,7 +195,7 @@ def define_node( in_channels = input.shape[1] out_channels = weight.shape[0] if (in_channels == group.number) and (out_channels % in_channels) == 0: - """Depthwise convolution case""" + """Depthwise convolution case.""" # Reshape torch shape format of weight tensor to tosa required format. 
# https://www.mlplatform.org/tosa/tosa_spec.html#_depthwise_conv2d m_length = int(out_channels / in_channels) @@ -178,7 +240,7 @@ def define_node( acc_type=acc_type, ) else: - """Regular convolution case""" + """Regular convolution case.""" tosa_op = ts.TosaOp.Op().CONV2D weight_name = weight.name @@ -207,7 +269,7 @@ def define_node( # For quantized convolution, rescale the output value back to the same # integer value domain of the next op. Otherwise return float32 output. - if inputs[0].dtype == ts.DType.INT8: + if output.dtype == ts.DType.INT8 or output.dtype == ts.DType.INT16: # Get scale_factor from input, weight, and output. input_scale = input_qparams[0].get_scale_per_tensor() # type: ignore[possibly-undefined] # pyre-ignore [61] per_channel_quant = input_qparams[1].per_channel # pyre-ignore [61] diff --git a/backends/arm/operators/op_eq.py b/backends/arm/operators/op_eq.py index 2136fe2e946..76b6e67cd8d 100644 --- a/backends/arm/operators/op_eq.py +++ b/backends/arm/operators/op_eq.py @@ -7,8 +7,6 @@ from typing import Any, List -import executorch.backends.arm.tosa.quant_utils as tqutils - from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, @@ -56,23 +54,12 @@ def define_node( ) validate_valid_dtype(self.target, output, ts.DType.BOOL, output.tosa_spec) - input_nodes = inputs - # Handle quantization - if inputs[0].dtype == ts.DType.INT8: - # Rescale inputs to 32 bit - rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32( - tosa_graph, inputs, node, self.tosa_spec - ) - - # Update IO - input_nodes = rescaled_inputs - # Do the equal comparison self._serialize_operator( node, tosa_graph, ts.TosaOp.Op().EQUAL, - [input_nodes[0].name, input_nodes[1].name], + [inputs[0].name, inputs[1].name], [output.name], None, ) diff --git a/backends/arm/operators/op_ge.py b/backends/arm/operators/op_ge.py index c538e735880..4bb20cac77f 100644 --- a/backends/arm/operators/op_ge.py +++ b/backends/arm/operators/op_ge.py @@ -7,8 
+7,6 @@ from typing import Any, List -import executorch.backends.arm.tosa.quant_utils as tqutils - from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, @@ -56,22 +54,11 @@ def define_node( ) validate_valid_dtype(self.target, output, ts.DType.BOOL, output.tosa_spec) - input_nodes = inputs - # Handle quantization - if inputs[0].dtype == ts.DType.INT8: - # Rescale inputs to 32 bit - rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32( - tosa_graph, inputs, node, self.tosa_spec - ) - - # Update IO - input_nodes = rescaled_inputs - self._serialize_operator( node, tosa_graph, ts.TosaOp.Op().GREATER_EQUAL, - [input_nodes[0].name, input_nodes[1].name], + [inputs[0].name, inputs[1].name], [output.name], None, ) diff --git a/backends/arm/operators/op_gt.py b/backends/arm/operators/op_gt.py index d407e28c1b6..c25c959681e 100644 --- a/backends/arm/operators/op_gt.py +++ b/backends/arm/operators/op_gt.py @@ -7,8 +7,6 @@ from typing import Any, List -import executorch.backends.arm.tosa.quant_utils as tqutils - from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, @@ -56,22 +54,11 @@ def define_node( ) validate_valid_dtype(self.target, output, ts.DType.BOOL, output.tosa_spec) - input_nodes = inputs - # Handle quantization - if inputs[0].dtype == ts.DType.INT8: - # Rescale inputs to 32 bit - rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32( - tosa_graph, inputs, node, self.tosa_spec - ) - - # Update IO - input_nodes = rescaled_inputs - self._serialize_operator( node, tosa_graph, ts.TosaOp.Op().GREATER, - [input_nodes[0].name, input_nodes[1].name], + [inputs[0].name, inputs[1].name], [output.name], None, ) diff --git a/backends/arm/operators/op_le.py b/backends/arm/operators/op_le.py index 403c6c233d3..e62d669814f 100644 --- a/backends/arm/operators/op_le.py +++ b/backends/arm/operators/op_le.py @@ -7,8 +7,6 @@ from typing import Any, List -import 
executorch.backends.arm.tosa.quant_utils as tqutils - from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, @@ -56,22 +54,11 @@ def define_node( ) validate_valid_dtype(self.target, output, ts.DType.BOOL, output.tosa_spec) - input_nodes = inputs - # Handle quantization - if inputs[0].dtype == ts.DType.INT8: - # Rescale inputs to 32 bit - rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32( - tosa_graph, inputs, node, self.tosa_spec - ) - - # Update IO - input_nodes = rescaled_inputs - self._serialize_operator( node, tosa_graph, ts.TosaOp.Op().GREATER_EQUAL, - [input_nodes[1].name, input_nodes[0].name], + [inputs[1].name, inputs[0].name], [output.name], None, ) diff --git a/backends/arm/operators/op_lt.py b/backends/arm/operators/op_lt.py index f5132dd4feb..cccb0abd5d7 100644 --- a/backends/arm/operators/op_lt.py +++ b/backends/arm/operators/op_lt.py @@ -7,8 +7,6 @@ from typing import Any, List -import executorch.backends.arm.tosa.quant_utils as tqutils - from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, @@ -56,22 +54,11 @@ def define_node( ) validate_valid_dtype(self.target, output, ts.DType.BOOL, output.tosa_spec) - input_nodes = inputs - # Handle quantization - if inputs[0].dtype == ts.DType.INT8: - # Rescale inputs to 32 bit - rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32( - tosa_graph, inputs, node, self.tosa_spec - ) - - # Update IO - input_nodes = rescaled_inputs - self._serialize_operator( node, tosa_graph, ts.TosaOp.Op().GREATER, - [input_nodes[1].name, input_nodes[0].name], + [inputs[1].name, inputs[0].name], [output.name], None, ) diff --git a/backends/arm/operators/op_maximum.py b/backends/arm/operators/op_maximum.py index 66437f8af1d..50c6e06a4bb 100644 --- a/backends/arm/operators/op_maximum.py +++ b/backends/arm/operators/op_maximum.py @@ -7,12 +7,6 @@ from typing import Any, List -import executorch.backends.arm.tosa.quant_utils as tqutils - -from 
executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( - get_input_qparams, -) - from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, @@ -22,9 +16,8 @@ validate_same_dtype, validate_valid_dtype, ) -from executorch.backends.arm.tosa import TosaSpecification from executorch.backends.arm.tosa.mapping import TosaArg -from executorch.backends.arm.tosa.utils import tosa_shape +from executorch.backends.arm.tosa.specification import TosaSpecification from torch.fx import Node @@ -56,35 +49,12 @@ def define_node( validate_valid_dtype( self.target, [*inputs, output], - [ts.DType.INT8, ts.DType.INT32, ts.DType.FP32], + [ts.DType.INT32, ts.DType.FP32], output.tosa_spec, ) - scale_back = 1.0 - max_output = output - if inputs[0].dtype == ts.DType.INT8: - input_qparams = get_input_qparams(node) - if len(input_qparams) != 2: - raise ValueError( - f"Both inputs need to have quantization information for {node}" - ) - if input_qparams[0] != input_qparams[1]: - raise ValueError( - "Both inputs must have the same quantization parameters for MAX" - ) - - operand_inputs, scale_back = tqutils.insert_rescale_ops_to_int32( - tosa_graph, inputs, node, self.tosa_spec - ) - - output.shape = tosa_shape(output.shape, output.dim_order) - max_output = tosa_graph.addIntermediate(output.shape, ts.DType.INT32) - else: - operand_inputs = inputs - attr_maximum = ts.TosaSerializerAttribute() - - # Set to PROPOGATE as default + # Set to PROPAGATE as default attr_maximum.MaximumAttribute(nan_mode=NanPropagationMode.PROPAGATE) self._serialize_operator( @@ -92,15 +62,9 @@ def define_node( tosa_graph, ts.TosaOp.Op().MAXIMUM, [ - operand_inputs[0].name, - operand_inputs[1].name, + inputs[0].name, + inputs[1].name, ], - [max_output.name], + [output.name], attr_maximum, ) - - if output.dtype == ts.DType.INT8: - # insert RESCALE from int32 back to int8 - tqutils.insert_rescale_op_to_int8( - tosa_graph, max_output, scale_back, node, self.tosa_spec 
- ) diff --git a/backends/arm/operators/op_minimum.py b/backends/arm/operators/op_minimum.py index 518366d5463..d5b97f186d3 100644 --- a/backends/arm/operators/op_minimum.py +++ b/backends/arm/operators/op_minimum.py @@ -7,11 +7,6 @@ from typing import Any, List -import executorch.backends.arm.tosa.quant_utils as tqutils - -from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( - get_input_qparams, -) from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, register_node_visitor, @@ -23,7 +18,6 @@ ) from executorch.backends.arm.tosa import TosaSpecification from executorch.backends.arm.tosa.mapping import TosaArg -from executorch.backends.arm.tosa.utils import tosa_shape from torch.fx import Node @@ -55,35 +49,12 @@ def define_node( validate_valid_dtype( self.target, [*inputs, output], - [ts.DType.INT8, ts.DType.INT32, ts.DType.FP32], + [ts.DType.INT32, ts.DType.FP32], output.tosa_spec, ) - scale_back = 1.0 - min_output = output - if inputs[0].dtype == ts.DType.INT8: - input_qparams = get_input_qparams(node) - if len(input_qparams) != 2: - raise ValueError( - f"Both inputs need to have quantization information for {node}" - ) - if input_qparams[0] != input_qparams[1]: - raise ValueError( - "Both inputs must have the same quantization parameters for MIN" - ) - - operand_inputs, scale_back = tqutils.insert_rescale_ops_to_int32( - tosa_graph, inputs, node, self.tosa_spec - ) - - output.shape = tosa_shape(output.shape, output.dim_order) - min_output = tosa_graph.addIntermediate(output.shape, ts.DType.INT32) - else: - operand_inputs = inputs - attr_minimum = ts.TosaSerializerAttribute() - - # Set to PROPOGATE as default + # Set to PROPAGATE as default attr_minimum.MinimumAttribute(nan_mode=NanPropagationMode.PROPAGATE) self._serialize_operator( @@ -91,15 +62,9 @@ def define_node( tosa_graph, ts.TosaOp.Op().MINIMUM, [ - operand_inputs[0].name, - operand_inputs[1].name, + inputs[0].name, + inputs[1].name, ], - 
[min_output.name], + [output.name], attr_minimum, ) - - if output.dtype == ts.DType.INT8: - # insert RESCALE from int32 back to int8 - tqutils.insert_rescale_op_to_int8( - tosa_graph, min_output, scale_back, node, self.tosa_spec - ) diff --git a/backends/arm/operators/op_repeat.py b/backends/arm/operators/op_repeat.py index 5db7ce9347c..9ee4e9fedf8 100644 --- a/backends/arm/operators/op_repeat.py +++ b/backends/arm/operators/op_repeat.py @@ -44,7 +44,7 @@ def define_node( validate_valid_dtype( self.target, [inputs[0], output], - [ts.DType.INT8, ts.DType.INT32, ts.DType.FP32], + [ts.DType.INT8, ts.DType.INT32, ts.DType.INT16, ts.DType.FP32], output.tosa_spec, ) diff --git a/backends/arm/operators/op_sub.py b/backends/arm/operators/op_sub.py index 9c27fddf68a..5f037dc3d1c 100644 --- a/backends/arm/operators/op_sub.py +++ b/backends/arm/operators/op_sub.py @@ -50,7 +50,7 @@ def define_node( validate_valid_dtype( self.target, [*inputs, output], - [ts.DType.INT8, ts.DType.INT32], + [ts.DType.INT8, ts.DType.INT16, ts.DType.INT32], output.tosa_spec, ) @@ -59,12 +59,18 @@ def define_node( rescaled_inputs, scale_back = tqutils.insert_rescale_ops_to_int32_maxscale( tosa_graph, inputs, node, self.tosa_spec ) + elif inputs[0].dtype == ts.DType.INT16: + rescaled_inputs, scale_back = ( + tqutils.insert_rescale_ops_int16_to_int32_maxscale( + tosa_graph, inputs, node, self.tosa_spec + ) + ) else: # input[0].dtype == ts.DType.INT32 # Non quantized input, natively support by TOSA.SUB rescaled_inputs = inputs - if output.dtype == ts.DType.INT8: + if output.dtype in [ts.DType.INT8, ts.DType.INT16]: broadcasted_shape = tutils.tosa_shape(output.shape, output.dim_order) sub_output = tosa_graph.addIntermediate(broadcasted_shape, ts.DType.INT32) else: @@ -95,6 +101,15 @@ def define_node( compute_rescale=False, tosa_spec=self.tosa_spec, ) # type: ignore[possibly-undefined] + elif output.dtype == ts.DType.INT16: + tqutils.insert_rescale_op_to_int16( + tosa_graph, + sub_output, + scale_back, 
+ node, + compute_rescale=False, + tosa_spec=self.tosa_spec, + ) # type: ignore[possibly-undefined] @register_node_visitor diff --git a/backends/arm/operators/op_bmm.py b/backends/arm/operators/op_tosa_matmul.py similarity index 53% rename from backends/arm/operators/op_bmm.py rename to backends/arm/operators/op_tosa_matmul.py index 382386ffa26..b177fd2ba37 100644 --- a/backends/arm/operators/op_bmm.py +++ b/backends/arm/operators/op_tosa_matmul.py @@ -5,13 +5,14 @@ # LICENSE file in the root directory of this source tree. # pyre-unsafe +"""Provide a visitor for lowering batched matmul (BMM) to TOSA.""" + from typing import Any, List import torch from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( get_input_qparams, - get_output_qparams, ) from executorch.backends.arm.operators.node_visitor import ( NodeVisitor, @@ -24,13 +25,13 @@ ) from executorch.backends.arm.tosa import TosaSpecification from executorch.backends.arm.tosa.mapping import TosaArg -from executorch.backends.arm.tosa.quant_utils import build_rescale -from tosa.RoundingMode import RoundingMode # type: ignore @register_node_visitor -class BMMVisitor(NodeVisitor): - target = "aten.bmm.default" +class MatmulVisitor(NodeVisitor): + """Provide a visitor that serializes TOSA ``MATMUL``.""" + + target = "tosa.MATMUL.default" tosa_specs = [ TosaSpecification.create_from_string("TOSA-1.0+INT"), @@ -47,35 +48,36 @@ def define_node( inputs: List[TosaArg], output: TosaArg, ) -> None: - + """Define the TOSA ``MATMUL`` operator.""" import serializer.tosa_serializer as ts # type: ignore validate_num_inputs(self.target, inputs, 2) - validate_same_dtype(self.target, [*inputs, output], ts) + validate_same_dtype(self.target, [*inputs], ts) validate_valid_dtype( self.target, - [*inputs, output], + [*inputs], [ts.DType.INT8, ts.DType.INT16, ts.DType.FP32], output.tosa_spec, ) + validate_valid_dtype( + self.target, + [output], + [ts.DType.INT32, ts.DType.INT48, ts.DType.FP32], + 
output.tosa_spec, + ) - # aten.bmm maps directly to MATMUL - - # For INT8, we need to get the zero points and add an intermediate tensor - # for a later rescale. - - if inputs[0].dtype == ts.DType.INT8: + # We need to get the zero points and add an intermediate tensor for INT16 case + if inputs[0].dtype in (ts.DType.INT8, ts.DType.INT16): input_qparams = get_input_qparams(node) input0_zp = input_qparams[0].get_zp_per_tensor() input1_zp = input_qparams[1].get_zp_per_tensor() - bmm_result = tosa_graph.addIntermediate(output.shape, ts.DType.INT32) - bmm_output_name = bmm_result.name else: - bmm_output_name = output.name input0_zp, input1_zp = 0, 0 - tosa_graph.addConst([1], inputs[0].dtype, [input0_zp], name=f"{node.name}_A_ZP") - tosa_graph.addConst([1], inputs[1].dtype, [input1_zp], name=f"{node.name}_B_ZP") + input_A_ZP_name = f"{node.name}_A_ZP" + input_B_ZP_name = f"{node.name}_B_ZP" + tosa_graph.addConst([1], inputs[0].dtype, [input0_zp], name=input_A_ZP_name) + tosa_graph.addConst([1], inputs[1].dtype, [input1_zp], name=input_B_ZP_name) # Add the MATMUL to the TOSA graph. self._serialize_operator( @@ -85,27 +87,8 @@ def define_node( [ inputs[0].name, inputs[1].name, - f"{node.name}_A_ZP", - f"{node.name}_B_ZP", + input_A_ZP_name, + input_B_ZP_name, ], - [bmm_output_name], + [output.name], ) - - # As INT8 accumulates into INT32, we need to rescale it back to INT8 - if output.dtype == ts.DType.INT8: - output_qparams = get_output_qparams(node)[0] - final_output_scale = ( - input_qparams[0].get_scale_per_tensor() * input_qparams[1].get_scale_per_tensor() # type: ignore[possibly-undefined] # pyre-ignore[61] - ) / output_qparams.get_scale_per_tensor() - - build_rescale( - tosa_fb=tosa_graph, - scale=[final_output_scale], - # pyre-ignore[61]: Uninitialized local [61]: Local variable `bmm_result` is undefined, or not always defined. 
- input_node=bmm_result, # type: ignore[possibly-undefined] - output_name=output.name, - output_type=ts.DType.INT8, - input_zp=[0], - output_zp=[output_qparams.get_zp_per_tensor()], - rounding_mode=RoundingMode.SINGLE_ROUND, - ) diff --git a/backends/arm/operators/op_rescale.py b/backends/arm/operators/op_tosa_rescale.py similarity index 100% rename from backends/arm/operators/op_rescale.py rename to backends/arm/operators/op_tosa_rescale.py diff --git a/backends/arm/operators/op_upsample_nearest2d.py b/backends/arm/operators/op_tosa_resize.py similarity index 82% rename from backends/arm/operators/op_upsample_nearest2d.py rename to backends/arm/operators/op_tosa_resize.py index 3c3ca67c9f5..020395ee7c2 100644 --- a/backends/arm/operators/op_upsample_nearest2d.py +++ b/backends/arm/operators/op_tosa_resize.py @@ -24,8 +24,8 @@ @register_node_visitor -class UpsampleNearest2dVisitor(NodeVisitor): - target = "aten.upsample_nearest2d.vec" +class ResizeVisitor(NodeVisitor): + target = "tosa.RESIZE.default" tosa_specs = NodeVisitor.tosa_specs @@ -41,12 +41,18 @@ def define_node( ) -> None: import serializer.tosa_serializer as ts - validate_num_inputs(self.target, inputs, 3) - validate_same_dtype(self.target, [inputs[0], output], ts) + validate_num_inputs(self.target, inputs, [3, 4]) + if node.kwargs.get("resize_mode") == "bilinear": + resize_mode = ResizeMode.BILINEAR + align_corners = bool(node.args[2]) + else: + resize_mode = ResizeMode.NEAREST + align_corners = False + validate_same_dtype(self.target, [inputs[0], output], ts) validate_valid_dtype( self.target, [inputs[0], output], - [ts.DType.INT8, ts.DType.INT32, ts.DType.FP32], + [ts.DType.INT8, ts.DType.INT32, ts.DType.FP16, ts.DType.FP32], output.tosa_spec, ) @@ -59,7 +65,7 @@ def define_node( # Align corners shouldn't make a difference for nearest upsampling. We set to False so # half pixel centers are used for resize parameter logic. 
scale_n_yx, scale_d_yx, offset_yx, border_yx = get_resize_parameters( - input_size_yx, output_size_yx, ResizeMode.NEAREST, align_corners=False + input_size_yx, output_size_yx, resize_mode, align_corners=align_corners ) def in_int16_range(x): @@ -86,7 +92,7 @@ def in_int16_range(x): ) attr = ts.TosaSerializerAttribute() attr.ResizeAttribute( - mode=ResizeMode.NEAREST, + mode=resize_mode, ) self._serialize_operator( diff --git a/backends/arm/operators/op_table.py b/backends/arm/operators/op_tosa_table.py similarity index 100% rename from backends/arm/operators/op_table.py rename to backends/arm/operators/op_tosa_table.py diff --git a/backends/arm/operators/op_transpose.py b/backends/arm/operators/op_tosa_transpose.py similarity index 100% rename from backends/arm/operators/op_transpose.py rename to backends/arm/operators/op_tosa_transpose.py diff --git a/backends/arm/operators/op_upsample_bilinear2d.py b/backends/arm/operators/op_upsample_bilinear2d.py deleted file mode 100644 index 3cc620727e0..00000000000 --- a/backends/arm/operators/op_upsample_bilinear2d.py +++ /dev/null @@ -1,148 +0,0 @@ -# Copyright 2025 Arm Limited and/or its affiliates. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# pyre-unsafe -from typing import Any, List - -import torch - -from executorch.backends.arm.operators.node_visitor import ( - NodeVisitor, - register_node_visitor, -) -from executorch.backends.arm.operators.operator_validation_utils import ( - validate_num_inputs, - validate_same_dtype, - validate_valid_dtype, -) -from executorch.backends.arm.tosa.mapping import TosaArg -from executorch.backends.arm.tosa.quant_utils import build_rescale -from executorch.backends.arm.tosa.utils import get_resize_parameters, tosa_shape - - -@register_node_visitor -class UpsampleBilinear2dVisitor(NodeVisitor): - - target = "aten.upsample_bilinear2d.vec" - tosa_specs = NodeVisitor.tosa_specs - - def __init__(self, *args): - super().__init__(*args) - - def define_node( - self, - node: torch.fx.Node, - tosa_graph: Any, - inputs: List[TosaArg], - output: TosaArg, - ) -> None: - import serializer.tosa_serializer as ts - from tosa.ResizeMode import ResizeMode # type: ignore - from tosa.RoundingMode import RoundingMode # type: ignore - - validate_num_inputs(self.target, inputs, 4) - validate_same_dtype(self.target, [inputs[0], output], ts) - validate_valid_dtype( - self.target, - [inputs[0], output], - [ts.DType.INT8, ts.DType.INT32, ts.DType.FP32], - output.tosa_spec, - ) - - if inputs[0].shape is None or output.shape is None: - raise ValueError("Only static shapes are supported") - - input_dtype = inputs[0].dtype - - # tosa_shape output is NHWC, take HW - input_size_yx = tuple([inputs[0].shape[dim] for dim in inputs[0].dim_order])[ - 1:3 - ] - output_size_yx = tuple([output.shape[dim] for dim in output.dim_order])[1:3] - - # Get align_corners value from the node arguments. 
- align_corners = bool(node.args[2]) - scale_n_yx, scale_d_yx, offset_yx, border_yx = get_resize_parameters( - input_size_yx, - output_size_yx, - ResizeMode.NEAREST, - align_corners=align_corners, - ) - - def in_int16_range(x): - return torch.all(x >= -(2**15)) and torch.all(x <= 2**15 - 1) - - if not in_int16_range(scale_n_yx): - raise ValueError("scale_n_yx is out of the int16 range") - if not in_int16_range(scale_d_yx): - raise ValueError("scale_d_yx is out of the int16 range") - if not in_int16_range(border_yx): - raise ValueError("border_yx is out of the int16 range") - - scales = [scale_n_yx[0], scale_d_yx[0], scale_n_yx[1], scale_d_yx[1]] - - attr = ts.TosaSerializerAttribute() - attr.ResizeAttribute(mode=ResizeMode.BILINEAR) - - scales_tensor = tosa_graph.addConst( - [len(scales)], ts.DType.SHAPE, scales, node.name + "_scales" - ) - offset = offset_yx.tolist() - offset_tensor = tosa_graph.addConst( - [len(offset)], ts.DType.SHAPE, offset, node.name + "_offset" - ) - border = border_yx.tolist() - border_tensor = tosa_graph.addConst( - [len(border)], ts.DType.SHAPE, border, node.name + "_border" - ) - if input_dtype == output.dtype == ts.DType.FP32: - self._serialize_operator( - node, - tosa_graph, - ts.TosaOp.Op().RESIZE, - [ - inputs[0].name, - scales_tensor.name, - offset_tensor.name, - border_tensor.name, - ], - [output.name], - attr, - ) - return - elif input_dtype == output.dtype == ts.DType.INT8: - intermediate = tosa_graph.addIntermediate( - tosa_shape(output.shape, output.dim_order), ts.DType.INT32 - ) - self._serialize_operator( - node, - tosa_graph, - ts.TosaOp.Op().RESIZE, - [ - inputs[0].name, - scales_tensor.name, - offset_tensor.name, - border_tensor.name, - ], - [intermediate.name], - attr, - ) - - final_output_scale = float(1 / (scale_n_yx[0] * scale_n_yx[1])) - - build_rescale( - tosa_fb=tosa_graph, - scale=[final_output_scale], - input_node=intermediate, - output_name=output.name, - output_type=ts.DType.INT8, - input_zp=[0], - 
output_zp=[0], - rounding_mode=RoundingMode.SINGLE_ROUND, - ) - else: - raise ValueError( - "Input/output dtype not in {float32, int8}: {input_dtype=} {output.dtype=}" - ) diff --git a/backends/arm/process_node.py b/backends/arm/process_node.py index 9ca435c60c5..8865513a6dd 100644 --- a/backends/arm/process_node.py +++ b/backends/arm/process_node.py @@ -12,7 +12,7 @@ import torch import torch.fx from executorch.backends.arm.operators.node_visitor import NodeVisitor -from executorch.backends.arm.tosa.mapping import TosaArg +from executorch.backends.arm.tosa.mapping import TosaArg, TosaSpecialDtype from executorch.backends.arm.tosa.specification import TosaSpecification from executorch.backends.arm.tosa.utils import tosa_shape from torch._export.utils import ( @@ -70,13 +70,6 @@ def process_inputs( tosa_spec: TosaSpecification, ): """Serialize an input node""" - # inputs need to be in default dim_order (contiguous memory format) - meta = node.meta["val"] - if meta.dim_order() != tuple(range(meta.dim())): - raise RuntimeError( - f"Arm backend only supports contiguous memory format for inputs. 
" - f"Expected dim_order: {tuple(range(meta.dim()))}, but got: {meta.dim_order()} for node {node.name}" - ) try: tosa_arg = TosaArg(node, tosa_spec) except ValueError as e: @@ -113,16 +106,28 @@ def process_inputs_to_parameters( ) from e parameter_data = get_param(edge_program, node) - assert isinstance(parameter_data, torch.Tensor), "Expect Attr to be tensor" + if not isinstance(parameter_data, torch.Tensor): + raise TypeError( + f"Expected parameter '{node.name}' to be a torch.Tensor, got " + f"{type(parameter_data).__name__}" + ) parameter_values = parameter_data.detach().numpy() if tosa_arg.dtype == torch.float32: - assert tosa_spec.support_float(), f"{tosa_spec} doesn't support float" + if not tosa_spec.support_float(): + raise ValueError(f"{tosa_spec} doesn't support float operations") + + # Handle special case for INT48 tensors + special_type = node.meta.get(TosaSpecialDtype.meta_key(), None) + if isinstance(special_type, TosaSpecialDtype): + tosa_dtype = special_type.get_tosa_dtype() + else: + tosa_dtype = tosa_arg.dtype parameter_values = np.transpose(parameter_values, tosa_arg.dim_order) tosa_graph.addConst( - parameter_values.shape, tosa_arg.dtype, parameter_values, name=tosa_arg.name + parameter_values.shape, tosa_dtype, parameter_values, name=tosa_arg.name ) @@ -142,7 +147,11 @@ def process_inputs_to_buffers( ) from e buffer_data = get_buffer(edge_program, node) - assert isinstance(buffer_data, torch.Tensor), "Expect Attr to be tensor" + if not isinstance(buffer_data, torch.Tensor): + raise TypeError( + f"Expected buffer '{node.name}' to be a torch.Tensor, got " + f"{type(buffer_data).__name__}" + ) buffer_values = buffer_data.detach().numpy() # TODO: fragile code for temporary fix @@ -183,8 +192,12 @@ def process_placeholder( tosa_spec: TosaSpecification, ): """Wrapper for processing and serializing all types of placeholders""" - assert node.name == node.target, "Expect placeholder name and target to match" - assert 0 == len(node.args), "Can't handle 
default input values" + if node.name != node.target: + raise ValueError( + f"Placeholder name '{node.name}' does not match target '{node.target}'" + ) + if len(node.args) != 0: + raise ValueError(f"Placeholder '{node.name}' must not have default values") if node.name in edge_program.graph_signature.user_inputs: process_inputs(node, tosa_graph, tosa_spec) diff --git a/backends/arm/quantizer/__init__.py b/backends/arm/quantizer/__init__.py index 5cb5c834a98..e36c683416a 100644 --- a/backends/arm/quantizer/__init__.py +++ b/backends/arm/quantizer/__init__.py @@ -2,7 +2,12 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +"""Expose quantizer APIs and load optional quantized kernels. +Import the public quantizer classes and configuration helpers for Arm +backends. Attempt to load portable and quantized libraries; fall back to a +log message if unavailable. +""" from .quantization_config import QuantizationConfig # noqa # usort: skip from .arm_quantizer import ( # noqa diff --git a/backends/arm/quantizer/arm_quantizer_utils.py b/backends/arm/quantizer/arm_quantizer_utils.py index 838dd44733e..90876386aa6 100644 --- a/backends/arm/quantizer/arm_quantizer_utils.py +++ b/backends/arm/quantizer/arm_quantizer_utils.py @@ -6,10 +6,12 @@ # LICENSE file in the root directory of this source tree. # pyre-unsafe +"""Provide utilities for quantization annotations. -# -# Utility functions for TOSAQuantizer -# +Use these helpers to check and mark annotation state when working with +``QuantizationAnnotation`` entries in FX node metadata. + +""" from typing import cast @@ -20,7 +22,15 @@ def is_annotated(node: Node) -> bool: - """Given a node return whether the node is annotated.""" + """Return True if the node is annotated. + + Args: + node (Node): FX node to inspect. + + Returns: + bool: True if ``Q_ANNOTATION_KEY`` exists and ``_annotated`` is set. 
+ + """ return ( Q_ANNOTATION_KEY in node.meta and cast(QuantizationAnnotation, node.meta[Q_ANNOTATION_KEY])._annotated @@ -28,7 +38,15 @@ def is_annotated(node: Node) -> bool: def is_output_annotated(node: Node) -> bool: - """Given a node, return whether the output of the node is annotated.""" + """Return True if the node's output is annotated. + + Args: + node (Node): FX node to inspect. + + Returns: + bool: True if annotated and an output qspec is present. + + """ if Q_ANNOTATION_KEY in node.meta: annotation = cast(QuantizationAnnotation, node.meta[Q_ANNOTATION_KEY]) return annotation._annotated and annotation.output_qspec is not None @@ -37,8 +55,14 @@ def is_output_annotated(node: Node) -> bool: def mark_node_as_annotated(node: Node) -> None: - """Marks node as annotated. If needed, an empty QuantizationAnnotation is added - to the quantization_annotation node meta entry. + """Mark a node as annotated. + + Create an empty ``QuantizationAnnotation`` on the node when missing and set + its ``_annotated`` flag to True. + + Args: + node (Node): FX node to update. + """ if Q_ANNOTATION_KEY not in node.meta: node.meta[Q_ANNOTATION_KEY] = QuantizationAnnotation() diff --git a/backends/arm/quantizer/quantization_annotator.py b/backends/arm/quantizer/quantization_annotator.py index ff1ad50e517..349aa3e6b21 100644 --- a/backends/arm/quantizer/quantization_annotator.py +++ b/backends/arm/quantizer/quantization_annotator.py @@ -6,7 +6,7 @@ import logging import operator from dataclasses import dataclass -from typing import Callable, List, Optional, Sequence +from typing import Callable, cast, List, Optional, Sequence import torch import torch.fx @@ -137,11 +137,18 @@ def _is_large_scalar(node: Node, gm: torch.fx.GraphModule): node since histc op (in HistogramObserver) only works for values up to certain upper bound. 
""" + HISTC_UPPER_BOUND = 3.4028235e15 if node.op == "get_attr" and isinstance(node.target, str): tensor = _get_node_target(gm, node.target) # torch.histc works until this upper bound - HISTC_UPPER_BOUND = 3.4028235e15 return tensor.numel() == 1 and abs(tensor.item()) > HISTC_UPPER_BOUND + if node.op == "call_function" and node.target in ( + torch.ops.aten.full.default, + torch.ops.aten.full, + torch.ops.aten.fill_.Scalar, + ): + fill_value = cast(float, node.args[1]) + return abs(fill_value) > HISTC_UPPER_BOUND return False @@ -358,13 +365,13 @@ def _match_pattern( torch.ops.aten.permute_copy.default, torch.ops.aten.avg_pool2d.default, torch.ops.aten.max_pool2d.default, - torch.ops.aten.full.default, - torch.ops.aten.full, torch.ops.aten.flatten.using_ints, torch.ops.aten.dropout.default, torch.ops.aten.dropout_.default, torch.ops.aten.adaptive_avg_pool2d.default, torch.ops.aten.alias_copy.default, + torch.ops.aten.pixel_shuffle.default, + torch.ops.aten.pixel_unshuffle.default, ] @@ -391,7 +398,11 @@ def any_or_hardtanh_min_zero(n: Node): torch.ops.aten.conv2d.padding, ], [torch.ops.aten.batch_norm.default, F.batch_norm], - [torch.ops.aten.relu.default, torch.ops.aten.hardtanh.default], + [ + torch.ops.aten.relu.default, + torch.ops.aten.relu_.default, + torch.ops.aten.hardtanh.default, + ], ], filter_fn=any_or_hardtanh_min_zero, ): @@ -407,6 +418,7 @@ def any_or_hardtanh_min_zero(n: Node): ] elif node.target in ( torch.ops.aten.relu.default, + torch.ops.aten.relu_.default, torch.ops.aten.hardtanh.default, ): quant_properties.quant_output = _QuantProperty(0, output_act_qspec) @@ -443,7 +455,11 @@ def any_or_hardtanh_min_zero(n: Node): torch.ops.aten.linear.default, torch.ops.aten.conv2d.padding, ], - [torch.ops.aten.relu.default, torch.ops.aten.hardtanh.default], + [ + torch.ops.aten.relu.default, + torch.ops.aten.relu_.default, + torch.ops.aten.hardtanh.default, + ], ], any_or_hardtanh_min_zero, ): @@ -508,9 +524,6 @@ def any_or_hardtanh_min_zero(n: Node): ] 
quant_properties.quant_output = _QuantProperty(0, shared_qspec) # type: ignore[arg-type] elif node.target in _one_to_one_shared_input_or_input_act_qspec: - if not isinstance(node.args[0], Node): - return None - input_qspec = ( SharedQuantizationSpec(node.args[0]) # type: ignore[arg-type] if is_output_annotated(node.args[0]) # type: ignore @@ -568,7 +581,12 @@ def any_or_hardtanh_min_zero(n: Node): ), ] quant_properties.quant_output = None - elif node.target in [torch.ops.aten.scalar_tensor.default]: + elif node.target in [ + torch.ops.aten.scalar_tensor.default, + torch.ops.aten.full.default, + torch.ops.aten.full, + torch.ops.aten.fill_.Scalar, + ]: quant_properties.quant_inputs = [] quant_properties.quant_output = _QuantProperty(0, output_act_qspec) elif node.target in [operator.getitem]: @@ -625,6 +643,7 @@ def annotate_graph( # type: ignore[return] torch.ops.aten.full_like.default, torch.ops.aten.full.default, torch.ops.aten.full, + torch.ops.aten.fill_.Scalar, torch.ops.aten.scalar_tensor.default, ]: node.kwargs = {} diff --git a/backends/arm/quantizer/quantization_config.py b/backends/arm/quantizer/quantization_config.py index d5c3aab1060..7495ff22ac6 100644 --- a/backends/arm/quantizer/quantization_config.py +++ b/backends/arm/quantizer/quantization_config.py @@ -3,6 +3,13 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +"""Provide quantization configuration helpers for the Arm backend. + +Define a small dataclass to carry activation/weight/bias specs and helper +accessors that validate specs before use. Use this module to build and validate +quantization specs consumed by the annotator. + +""" # pyre-unsafe @@ -19,13 +26,38 @@ @dataclass(eq=True, frozen=True) class QuantizationConfig: + """Provide a container for quantization specs. + + Hold optional specs for input/output activations, weights, and bias, and + expose validated accessors. 
+ + Attributes: + input_activation (QuantizationSpec | None): Spec for input activations. + output_activation (QuantizationSpec | None): Spec for output activations. + weight (QuantizationSpec | None): Spec for weights. + bias (QuantizationSpec | None): Spec for bias values. + + """ + input_activation: QuantizationSpec | None output_activation: QuantizationSpec | None weight: QuantizationSpec | None bias: QuantizationSpec | None def get_input_act_qspec(self) -> QuantizationSpec | None: - """Returns QuantizationSpec 'input_activation' after asserting that input_activation.qscheme is valid.""" + """Get the validated input activation spec. + + Validate that the input activation qscheme is supported before + returning the spec. + + Returns: + QuantizationSpec | None: Input activation spec, or ``None`` when + unset. + + Raises: + ValueError: If the qscheme is not per-tensor affine or symmetric. + + """ if self.input_activation is None: return None # Validate that input_activation uses a supported qscheme @@ -39,7 +71,19 @@ def get_input_act_qspec(self) -> QuantizationSpec | None: return self.input_activation def get_output_act_qspec(self) -> QuantizationSpec | None: - """Returns QuantizationSpec 'output_activation' after asserting that output_activation.qscheme is valid.""" + """Get the validated output activation spec. + + Validate that the output activation qscheme is supported before + returning the spec. + + Returns: + QuantizationSpec | None: Output activation spec, or ``None`` when + unset. + + Raises: + ValueError: If the qscheme is not per-tensor affine or symmetric. + + """ if self.output_activation is None: return None # Validate that output_activation uses a supported qscheme @@ -53,7 +97,18 @@ def get_output_act_qspec(self) -> QuantizationSpec | None: return self.output_activation def get_weight_qspec(self) -> QuantizationSpec | None: - """Returns QuantizationSpec 'weight' after asserting that weight.qscheme is valid.""" + """Get the validated weight spec. 
+ + Validate that the weight qscheme is supported (per-tensor or + per-channel symmetric) before returning the spec. + + Returns: + QuantizationSpec | None: Weight spec, or ``None`` when unset. + + Raises: + ValueError: If the qscheme is not a supported symmetric scheme. + + """ if self.weight is None: return None # Validate that weight uses a supported qscheme @@ -65,11 +120,46 @@ def get_weight_qspec(self) -> QuantizationSpec | None: return self.weight def get_bias_qspec(self, node: torch.fx.Node) -> QuantizationSpec | None: - """Returns QuantizationSpec 'bias' after asserting that bias.dtype is torch.float.""" + """Get the derived or validated bias spec. + + For conv/linear ops, derive bias qparams from the input/weight observers. + Otherwise, validate a user-provided floating-point bias spec. + + Args: + node (torch.fx.Node): Node whose bias spec is requested. + + Returns: + QuantizationSpec | None: Derived or provided bias spec, or ``None`` + when unset. + + Raises: + ValueError: If deriving qparams sees an unexpected number of + observers/fake-quantizers, or if a provided bias dtype is not + floating-point. + + """ def _derive_qparams_fn( obs_or_fqs: list[ObserverOrFakeQuantize], ) -> tuple[torch.Tensor, torch.Tensor]: + """Compute bias scale/zero-point from activation/weight observers. + + Expect two observers or fake-quantize modules: one for the input + activation and one for the weight. The bias scale is the product of + input and weight scales, and the zero-point is a tensor of zeros. + + Args: + obs_or_fqs (list[ObserverOrFakeQuantize]): Observers/fake-quant + in order ``[act, weight]``. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: Bias scale tensor and + integer zero-point tensor. + + Raises: + ValueError: If the list does not contain exactly two items. 
+ + """ # Validate expected number of observers/fake-quantizes if len(obs_or_fqs) != 2: raise ValueError( @@ -89,29 +179,48 @@ def _derive_qparams_fn( torch.ops.aten.linear.default, torch.ops.aten.conv2d.padding, ]: - input_act = node.args[0] - weight = node.args[1] - # If the weights are quantized per_tensor, do the same with bias - qscheme = ( - torch.per_tensor_symmetric - if self.weight is None - else self.weight.qscheme - ) - ch_axis = None - if self.weight is not None: - if qscheme == torch.per_channel_symmetric: - ch_axis = self.weight.ch_axis - - quantization_spec = DerivedQuantizationSpec( - derived_from=[(input_act, node), (weight, node)], # type: ignore[list-item] - derive_qparams_fn=_derive_qparams_fn, - dtype=torch.int32, - quant_min=torch.iinfo(torch.int32).min, - quant_max=torch.iinfo(torch.int32).max - 1, - qscheme=qscheme, - ch_axis=ch_axis, - ) - return quantization_spec # type: ignore[return-value] + if self.input_activation is None or self.weight is None: + raise ValueError( + "Input activation and weight QuantizationConfig must be specified." 
+ ) + if self.input_activation.dtype == self.weight.dtype == torch.int8: + # This is the default int8 quantization which uses the derived quantization + # calculated from the activation and weight scale + input_act = node.args[0] + weight = node.args[1] + + # If the weights are quantized per_tensor, do the same with bias + qscheme = ( + torch.per_tensor_symmetric + if self.weight is None + else self.weight.qscheme + ) + ch_axis = None + if self.weight is not None: + if qscheme == torch.per_channel_symmetric: + ch_axis = self.weight.ch_axis + + quantization_spec = DerivedQuantizationSpec( + derived_from=[(input_act, node), (weight, node)], # type: ignore[list-item] + derive_qparams_fn=_derive_qparams_fn, + dtype=torch.int32, + quant_min=torch.iinfo(torch.int32).min, + quant_max=torch.iinfo(torch.int32).max - 1, + qscheme=qscheme, + ch_axis=ch_axis, + ) + return quantization_spec # type: ignore[return-value] + elif ( + self.input_activation.dtype == torch.int16 + and self.weight.dtype == torch.int8 + ): + # In case the activation is quantized to int16, the bias needs to be + # added after the convolution, so use the output quantization for this case. + return self.output_activation + else: + raise NotImplementedError( + f"Bias quantization of types: i:{self.input_activation.dtype}, w:{self.weight.dtype} not implemented" + ) if self.bias is None: return None diff --git a/backends/arm/requirements-arm-ethos-u.txt b/backends/arm/requirements-arm-ethos-u.txt index a26fb014234..9076aa08852 100644 --- a/backends/arm/requirements-arm-ethos-u.txt +++ b/backends/arm/requirements-arm-ethos-u.txt @@ -3,4 +3,4 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-ethos-u-vela == 4.4.0 +ethos-u-vela == 4.4.1 \ No newline at end of file diff --git a/backends/arm/requirements-arm-tosa.txt b/backends/arm/requirements-arm-tosa.txt index 0f9c2f702a4..16aa01a6c23 100644 --- a/backends/arm/requirements-arm-tosa.txt +++ b/backends/arm/requirements-arm-tosa.txt @@ -8,4 +8,4 @@ flatbuffers == 24.3.25 tosa-adapter-model-explorer == 0.0.1 ai-edge-model-explorer >= 0.1.16 -tosa-tools @ git+https://git.gitlab.arm.com/tosa/tosa-reference-model.git@v2025.07.0 +tosa-tools @ git+https://git.gitlab.arm.com/tosa/tosa-reference-model.git@v2025.07.1 diff --git a/backends/arm/runtime/EthosUBackend.cpp b/backends/arm/runtime/EthosUBackend.cpp index 8f63569eece..08589c34c69 100644 --- a/backends/arm/runtime/EthosUBackend.cpp +++ b/backends/arm/runtime/EthosUBackend.cpp @@ -249,15 +249,6 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { handles.inputs->io[i].elem_size); return Error::InvalidProgram; } - supported = executorch::runtime::is_contiguous_dim_order( - tensor_in.dim_order().data(), tensor_in.dim()); - if (!supported) { - ET_LOG( - Error, - "Input %d expected contiguous dim_order, but got non-contiguous dim_order", - i); - return Error::InvalidProgram; - } // Select a compatible copy routine including checking for input layouts // which require permutation. diff --git a/backends/arm/runtime/VGFSetup.cpp b/backends/arm/runtime/VGFSetup.cpp index abb4c50d8be..fa8c7ead220 100644 --- a/backends/arm/runtime/VGFSetup.cpp +++ b/backends/arm/runtime/VGFSetup.cpp @@ -24,6 +24,13 @@ namespace vgf { /* static function to map format to byte count */ static uint32_t get_format_size(VkFormat format); +// SPV_ARM_tensor does not support rank-0 representations according to the spec. +// Use an unsqueezed dimension when the resource table contains an empty +// shape. Tensors are output as rank 0 when copied back from the vgf backend. 
+namespace { +constexpr int64_t kScalarSentinelDimension = 1; +} + // Debug function to inspect memory properties static string memory_flags_to_string(VkMemoryPropertyFlags flags) { if (flags == 0) @@ -264,7 +271,11 @@ static void debug_print_resources( the_shape.size(), the_stride.size()); for (int j = 0; j < the_shape.size(); j++) { - ET_LOG(Info, " %d: dim %ld", j, the_shape[j]); + ET_LOG( + Info, + " %d: dim %lld", + j, + static_cast(the_shape[j])); } // Allocate a tensor with bound memory break; @@ -387,6 +398,7 @@ bool VgfRepr::process_vgf(const char* vgf_data, ArrayRef specs) { // Get tensor shape and strides auto shape = resource_decoder->getTensorShape(i); auto stride = resource_decoder->getTensorStride(i); + const auto shape_size = shape.size(); switch (resource_decoder->getCategory(i)) { case vgflib::ResourceCategory::INPUT: @@ -409,9 +421,9 @@ bool VgfRepr::process_vgf(const char* vgf_data, ArrayRef specs) { result = allocate_tensor( vk_physical, vk_device, - vgflib::ToVkFormat(resource_decoder->getVkFormat(i)), - static_cast(shape.size()), - shape.begin(), + resource_format, + shape_size == 0 ? 1 : static_cast(shape_size), + shape_size == 0 ? 
&kScalarSentinelDimension : shape.begin(), static_cast(stride.size()), stride.begin(), &tensor_description, @@ -422,8 +434,7 @@ bool VgfRepr::process_vgf(const char* vgf_data, ArrayRef specs) { ET_LOG(Error, "Failed to allocate tensor for VGF resource %d", i); return false; } - size_t e_size = get_format_size( - vgflib::ToVkFormat(resource_decoder->getVkFormat(i))); + size_t e_size = get_format_size(resource_format); if (0 == e_size) { ET_LOG(Error, "failed to get element size of VkFormat"); return false; @@ -449,9 +460,11 @@ bool VgfRepr::process_vgf(const char* vgf_data, ArrayRef specs) { .sType = VK_STRUCTURE_TYPE_TENSOR_DESCRIPTION_ARM, .pNext = nullptr, .tiling = VK_TENSOR_TILING_LINEAR_ARM, - .format = vgflib::ToVkFormat(resource_decoder->getVkFormat(i)), - .dimensionCount = static_cast(shape.size()), - .pDimensions = shape.begin(), + .format = resource_format, + .dimensionCount = + shape_size == 0 ? 1 : static_cast(shape_size), + .pDimensions = + shape_size == 0 ? &kScalarSentinelDimension : shape.begin(), // Note: stride_data of 0's causes size==0, null means stride==size .pStrides = (0 == stride.size() ? 
nullptr : stride.begin()), .usage = VK_TENSOR_USAGE_DATA_GRAPH_BIT_ARM, diff --git a/backends/arm/scripts/TOSA_minimal_example.ipynb b/backends/arm/scripts/TOSA_minimal_example.ipynb index b79780c6a07..a249f03a873 100644 --- a/backends/arm/scripts/TOSA_minimal_example.ipynb +++ b/backends/arm/scripts/TOSA_minimal_example.ipynb @@ -62,7 +62,7 @@ "model = Add()\n", "model = model.eval()\n", "exported_program = torch.export.export(model, example_inputs)\n", - "graph_module = exported_program.module()\n", + "graph_module = exported_program.graph_module\n", "\n", "_ = graph_module.print_readable()" ] @@ -201,7 +201,7 @@ " config=ExecutorchBackendConfig(extract_delegate_segments=False)\n", " )\n", "\n", - "executorch_program_manager.exported_program().module().print_readable()\n", + "executorch_program_manager.exported_program().graph_module.print_readable()\n", "\n", "# Save pte file\n", "pte_name = base_name + \".pte\"\n", diff --git a/backends/arm/scripts/build_executor_runner_vkml.sh b/backends/arm/scripts/build_executor_runner_vkml.sh index 1df63acc425..afca02c6299 100755 --- a/backends/arm/scripts/build_executor_runner_vkml.sh +++ b/backends/arm/scripts/build_executor_runner_vkml.sh @@ -69,6 +69,7 @@ cmake \ -DCMAKE_BUILD_TYPE=${build_type} \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_XNNPACK=OFF \ diff --git a/backends/arm/scripts/mlsdk_utils.sh b/backends/arm/scripts/mlsdk_utils.sh index 7a7d2585e52..2c6553df3d3 100755 --- a/backends/arm/scripts/mlsdk_utils.sh +++ b/backends/arm/scripts/mlsdk_utils.sh @@ -38,6 +38,28 @@ function download_ai_mlsdk_manifest() { --manifest-url ${mlsdk_manifest_url} \ --manifest-branch ${mlsdk_manifest_tag} \ -g model-converter,emulation-layer,vgf-library + +# Update dependencies to use gitlab tosa-mlir-translator +# Do not indent the 
xml. Heredoc indentation is significant. +mkdir -p .repo/local_manifests/ +cat > ".repo/local_manifests/tosa_gitlab.xml" <<'XML' + + + + + + + + + +XML + ./repo sync -j$(nproc) popd @@ -109,7 +131,7 @@ function setup_mlsdk() { -DSPIRV_TOOLS_PATH=../../dependencies/SPIRV-Tools \ -DVULKAN_HEADERS_PATH=../../dependencies/Vulkan-Headers - cmake --build build + cmake --build build -j$(nproc) cmake --install build --prefix deploy popd fi diff --git a/backends/arm/scripts/parse_test_names.py b/backends/arm/scripts/parse_test_names.py index c6eaafa597b..54f8aa7421d 100644 --- a/backends/arm/scripts/parse_test_names.py +++ b/backends/arm/scripts/parse_test_names.py @@ -26,6 +26,8 @@ "_native_batch_norm_legit_no_training.default", "_native_batch_norm_legit.no_stats", "alias_copy.default", + "pixel_shuffle.default", + "pixel_unshuffle.default", ] ALL_EDGE_OPS = SAMPLE_INPUT.keys() | CUSTOM_EDGE_OPS @@ -95,6 +97,9 @@ def parse_test_name( op = op.removesuffix("_1d") op = op.removesuffix("_2d") + # Remove suffix for 16 bit activation and 8 bit weight test cases + op = op.removesuffix("_16a8w") + assert target != "None", f"{test_name} does not contain one of {TARGETS}" assert ( op in op_name_map.keys() diff --git a/backends/arm/scripts/run_fvp.sh b/backends/arm/scripts/run_fvp.sh index 0f76d0496de..5d3088c865a 100755 --- a/backends/arm/scripts/run_fvp.sh +++ b/backends/arm/scripts/run_fvp.sh @@ -22,6 +22,7 @@ data_file="" target="ethos-u55-128" timeout="600" etrecord_file="" +trace_file="" help() { echo "Usage: $(basename $0) [options]" @@ -31,6 +32,7 @@ help() { echo " --target= Target to build and run for Default: ${target}" echo " --timeout= Maximum target runtime, used to detect hanging, might need to be higer on large models Default: ${timeout}" echo " --etrecord= If ETDump is used you can supply a ETRecord file matching the PTE" + echo " --trace_file= File to write PMU trace output to" exit 0 } @@ -42,6 +44,7 @@ for arg in "$@"; do --target=*) target="${arg#*=}";; 
--timeout=*) timeout="${arg#*=}";; --etrecord=*) etrecord_file="${arg#*=}";; + --trace_file=*) trace_file="${arg#*=}";; *) ;; esac @@ -86,6 +89,14 @@ fi log_file=$(mktemp) +extra_args_u55=() +extra_args_u85=() + +if [[ -n "${trace_file}" ]]; then + extra_args_u55+=(-C "ethosu.extra_args=--pmu-trace ${trace_file}") + extra_args_u85+=(-C "mps4_board.subsystem.ethosu.extra_args=--pmu-trace ${trace_file}") +fi + if [[ ${target} == *"ethos-u55"* ]]; then ${nobuf} ${fvp_model} \ -C ethosu.num_macs=${num_macs} \ @@ -93,6 +104,7 @@ if [[ ${target} == *"ethos-u55"* ]]; then -C mps3_board.telnetterminal0.start_telnet=0 \ -C mps3_board.uart0.out_file='-' \ -C mps3_board.uart0.shutdown_on_eot=1 \ + "${extra_args_u55[@]}" \ -a "${elf_file}" \ ${data_file} \ --timelimit ${timeout} 2>&1 | sed 's/\r$//' | tee ${log_file} || true # seconds @@ -105,6 +117,7 @@ elif [[ ${target} == *"ethos-u85"* ]]; then -C mps4_board.telnetterminal0.start_telnet=0 \ -C mps4_board.uart0.out_file='-' \ -C mps4_board.uart0.shutdown_on_eot=1 \ + "${extra_args_u85[@]}" \ -a "${elf_file}" \ ${data_file} \ --timelimit ${timeout} 2>&1 | sed 's/\r$//' | tee ${log_file} || true # seconds diff --git a/backends/arm/test/TARGETS b/backends/arm/test/TARGETS index ec35b63f8f6..fd7d894fbf0 100644 --- a/backends/arm/test/TARGETS +++ b/backends/arm/test/TARGETS @@ -1,3 +1,8 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") load(":targets.bzl", "define_arm_tests") @@ -58,6 +63,7 @@ runtime.python_library( "//executorch/backends/arm/quantizer:lib", "//executorch/backends/arm/tosa:mapping", "//executorch/backends/arm:vgf", + "//executorch/backends/arm:_factory", "//executorch/devtools/backend_debug:delegation_info", "//executorch/exir/backend:operator_support", "fbsource//third-party/pypi/tabulate:tabulate", diff --git a/backends/arm/test/common.py b/backends/arm/test/common.py index 963084d6091..3b5dd8bd4db 100644 --- a/backends/arm/test/common.py +++ b/backends/arm/test/common.py @@ -14,6 +14,7 @@ import pytest from executorch.backends.arm.ethosu import EthosUCompileSpec + from executorch.backends.arm.test.runner_utils import ( arm_executor_runner_exists, corstone300_installed, @@ -226,6 +227,7 @@ def parametrize( test_data: dict[str, Any], xfails: dict[str, xfail_type] | None = None, strict: bool = True, + flakies: dict[str, int] | None = None, ): """ Custom version of pytest.mark.parametrize with some syntatic sugar and added xfail functionality @@ -236,12 +238,17 @@ def parametrize( """ if xfails is None: xfails = {} + if flakies is None: + flakies = {} def decorator_func(func): """Test data is transformed from a dict of (id, data) pairs to a list of pytest params to work with the native pytests parametrize function""" pytest_testsuite = [] for id, test_parameters in test_data.items(): - if id in xfails: + if id in flakies: + # Mark this parameter as flaky with given reruns + marker = (pytest.mark.flaky(reruns=flakies[id]),) + elif id in xfails: xfail_info = xfails[id] reason = "" raises = None diff --git a/backends/arm/test/conftest.py b/backends/arm/test/conftest.py index 6fc9e7e5adc..0060bf0ea63 100644 --- a/backends/arm/test/conftest.py +++ b/backends/arm/test/conftest.py @@ -118,7 +118,7 @@ def is_option_enabled(option: str, fail_if_not_enabled: bool = False) -> bool: a RuntimeError instead of returning 
False. """ - if option in pytest._test_options and pytest._test_options[option]: # type: ignore[attr-defined] + if hasattr(pytest, "_test_options") and option in pytest._test_options and pytest._test_options[option]: # type: ignore[attr-defined] return True else: if fail_if_not_enabled: diff --git a/backends/arm/test/misc/test_conv_relu_residual_add.py b/backends/arm/test/misc/test_conv_relu_residual_add.py index fdd6ec972a6..d88a9c74b7c 100644 --- a/backends/arm/test/misc/test_conv_relu_residual_add.py +++ b/backends/arm/test/misc/test_conv_relu_residual_add.py @@ -85,7 +85,6 @@ def test_tosa_u55_INT(per_channel_quantization): model_inputs, [], [], - run_on_fvp=True, use_to_edge_transform_and_lower=True, per_channel_quantization=per_channel_quantization, qtol=0, @@ -102,7 +101,6 @@ def test_tosa_u85_INT(per_channel_quantization): model_inputs, [], [], - run_on_fvp=True, use_to_edge_transform_and_lower=True, per_channel_quantization=per_channel_quantization, qtol=0, diff --git a/backends/arm/test/misc/test_debug_feats.py b/backends/arm/test/misc/test_debug_feats.py index 3796d3dce4a..c2f28f4e9d8 100644 --- a/backends/arm/test/misc/test_debug_feats.py +++ b/backends/arm/test/misc/test_debug_feats.py @@ -262,9 +262,10 @@ def forward(self, x): @common.parametrize("test_data", Add.inputs) +@common.XfailIfNoCorstone300 def test_fail_dump_tosa_ops(caplog, test_data: input_t1): pipeline = EthosU55PipelineINT[input_t1]( - Add(), test_data, [], [], use_to_edge_transform_and_lower=True, run_on_fvp=False + Add(), test_data, [], [], use_to_edge_transform_and_lower=True ) pipeline.dump_operator_distribution("to_edge_transform_and_lower") pipeline.run() diff --git a/backends/arm/test/misc/test_dim_order.py b/backends/arm/test/misc/test_dim_order.py new file mode 100644 index 00000000000..6b0b79add99 --- /dev/null +++ b/backends/arm/test/misc/test_dim_order.py @@ -0,0 +1,123 @@ +# Copyright 2024-2025 Arm Limited and/or its affiliates. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +from typing import Tuple + +import torch +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineINT, + EthosU85PipelineINT, + TosaPipelineFP, + TosaPipelineINT, +) + + +input_t1 = Tuple[torch.Tensor] # Input x + + +class ChannelsLastInput(torch.nn.Module): + """ + Test a complex case with (channels last, channels first) input, + and (channels first, channels last) output. + """ + + inputs: input_t1 = ( + torch.arange(1, 25, dtype=torch.float32) + .reshape((1, 2, 3, 4)) + .to(memory_format=torch.channels_last), + torch.arange(1, 25, dtype=torch.float32).reshape((1, 2, 3, 4)), + ) + + def forward(self, x, y): + x = x * x + return y, x + + +class ChannelsFirstOutput(torch.nn.Module): + """ + Test coverting to channels_first inside the delegate. + """ + + inputs: input_t1 = ( + torch.arange(1, 25, dtype=torch.float32) + .reshape((1, 2, 3, 4)) + .to(memory_format=torch.channels_last), + ) + + def forward(self, x): + x = x.clone(memory_format=torch.contiguous_format) * x + return x + + +class ChannelsLastOutput(torch.nn.Module): + """ + Test changing of dim_order inside the delegate. + """ + + inputs: input_t1 = (torch.arange(1, 9, dtype=torch.float32).reshape((1, 2, 2, 2)),) + + def forward(self, x): + x = x * x + x = x.clone(memory_format=torch.channels_last) + return x + + +class ChannelsLastInsidePartition(torch.nn.Module): + """ + Test dim_order changes inside the partiton, but no dim_order changes at input/output. 
+ """ + + inputs: input_t1 = (torch.randn((1, 2, 3, 3)),) + + def __init__(self): + super().__init__() + self.conv2d = torch.nn.Conv2d(in_channels=2, out_channels=2, kernel_size=(3, 3)) + + def forward(self, x): + return ( + self.conv2d(x.clone(memory_format=torch.channels_last)).clone( + memory_format=torch.contiguous_format + ) + * 1 + ) + + +test_modules = { + "channels_last_input": ChannelsLastInput, + "channels_first_output": ChannelsFirstOutput, + "channels_last_output": ChannelsLastOutput, + "channels_last_inside_partition": ChannelsLastInsidePartition, +} + + +@common.parametrize("module", test_modules) +def test_dim_order_tosa_FP(module): + pipeline = TosaPipelineFP[input_t1](module(), module.inputs, []) + pipeline.run() + + +@common.parametrize("module", test_modules) +def test_dim_order_tosa_INT(module): + pipeline = TosaPipelineINT[input_t1]( + module(), module.inputs, [], symmetric_io_quantization=True + ) + pipeline.run() + + +@common.XfailIfNoCorstone300 +@common.parametrize("module", test_modules) +def test_dim_order_u55_INT(module): + pipeline = EthosU55PipelineINT[input_t1](module(), module.inputs, []) + pipeline.run() + + +@common.XfailIfNoCorstone320 +@common.parametrize("module", test_modules) +def test_dim_order_u85_INT(module): + pipeline = EthosU85PipelineINT[input_t1](module(), module.inputs, []) + pipeline.run() diff --git a/backends/arm/test/misc/test_dim_order_guards.py b/backends/arm/test/misc/test_dim_order_guards.py deleted file mode 100644 index 80a3c014abc..00000000000 --- a/backends/arm/test/misc/test_dim_order_guards.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright 2024-2025 Arm Limited and/or its affiliates. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- - -from typing import Tuple - -import pytest - -import torch -from executorch.backends.arm.test import common - -from executorch.backends.arm.test.tester.test_pipeline import ( - TosaPipelineFP, - TosaPipelineINT, -) - - -input_t1 = Tuple[torch.Tensor] # Input x - - -class Conv2D(torch.nn.Module): - inputs: dict[str, input_t1] = { - "randn": (torch.randn(1, 2, 20, 20).to(memory_format=torch.channels_last),), - } - - def __init__(self): - super().__init__() - self.conv2d = torch.nn.Conv2d(in_channels=2, out_channels=3, kernel_size=(3, 3)) - - def forward(self, x): - return self.conv2d(x) - - -@common.parametrize("test_data", Conv2D.inputs) -def test_tosa_FP_pipeline(test_data: input_t1): - module = Conv2D() - pipeline = TosaPipelineFP[input_t1]( - module, - test_data, - [], - [], - use_to_edge_transform_and_lower=False, - ) - pos = pipeline.find_pos("partition") - pipeline._stages = pipeline._stages[:pos] - pipeline.run() - with pytest.raises(RuntimeError): - pipeline.tester.partition() - - -@common.parametrize("test_data", Conv2D.inputs) -def test_tosa_INT_pipeline(test_data: input_t1): - module = Conv2D() - pipeline = TosaPipelineINT[input_t1]( - module, - test_data, - [], - [], - use_to_edge_transform_and_lower=False, - ) - pos = pipeline.find_pos("partition") - pipeline._stages = pipeline._stages[:pos] - pipeline.run() - with pytest.raises(RuntimeError): - pipeline.tester.partition() diff --git a/backends/arm/test/misc/test_multiple_delegates.py b/backends/arm/test/misc/test_multiple_delegates.py index f716bc45385..8dad25f4180 100644 --- a/backends/arm/test/misc/test_multiple_delegates.py +++ b/backends/arm/test/misc/test_multiple_delegates.py @@ -23,7 +23,7 @@ class MultipleDelegatesModule(torch.nn.Module): def forward(self, x: torch.Tensor, y: torch.Tensor): z = x + y - s = torch.tan(z) + s = torch.max(z) return s * z diff --git a/backends/arm/test/misc/test_pass_required_order.py b/backends/arm/test/misc/test_pass_required_order.py new file mode 100644 
index 00000000000..2745d25a498 --- /dev/null +++ b/backends/arm/test/misc/test_pass_required_order.py @@ -0,0 +1,95 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import re +from typing import List, Set, Type + +import pytest +from executorch.backends.arm._passes.arm_pass_manager import ArmPass, ArmPassManager +from executorch.backends.arm.tosa.specification import TosaSpecification +from executorch.exir.pass_base import ExportPass + + +class PassC(ArmPass): + _passes_required_after: Set[Type[ExportPass]] = set() + + +class PassB(ArmPass): + _passes_required_after = {PassC} + + +class PassA(ArmPass): + _passes_required_after = {PassB, PassC} + + +class IndependentPass(ArmPass): + _passes_required_after: Set[Type[ExportPass]] = set() + + +def _setup_pass_manager(passes: List[ArmPass] | None = None): + tosa_spec = TosaSpecification.create_from_string("TOSA-1.00+INT") + pass_manager = ArmPassManager(tosa_spec) + if passes is not None: + for p in passes: + pass_manager.add_pass(p) + return pass_manager + + +def test_no_passes(): + pass_manager = _setup_pass_manager() + pass_manager.validate_constraints_mandatory() + + +def test_correct_order(): + pass_manager = _setup_pass_manager([PassA(), PassB(), PassC()]) + pass_manager.validate_constraints_mandatory() + + +def test_run_pass_twice(): + pass_manager = _setup_pass_manager([PassA(), PassB(), PassB(), PassC()]) + pass_manager.validate_constraints_mandatory() + + +def test_independent_pass(): + pass_manager = _setup_pass_manager( + [ + IndependentPass(), + PassA(), + IndependentPass(), + PassB(), + IndependentPass(), + PassC(), + IndependentPass(), + ] + ) + pass_manager.validate_constraints_mandatory() + + +def test_duplicated_requiring_pass_put_last(): + error_msg = """The following constraints for passes are not met: + - PassC must run after PassB +""" + pass_manager = 
_setup_pass_manager([PassA(), PassB(), PassC(), PassB()]) + with pytest.raises(RuntimeError, match=re.escape(error_msg)): + pass_manager.validate_constraints_mandatory() + + +def test_two_passes_wrong_order(): + error_msg = """The following constraints for passes are not met: + - PassC must run after PassB +""" + pass_manager = _setup_pass_manager([PassC(), PassB()]) + with pytest.raises(RuntimeError, match=re.escape(error_msg)): + pass_manager.validate_constraints_mandatory() + + +def test_missing_passes(): + error_msg = """The following constraints for passes are not met: + - PassC must run after PassA + - PassC must run after PassB +""" + pass_manager = _setup_pass_manager([PassA(), PassB()]) + with pytest.raises(RuntimeError, match=re.escape(error_msg)): + pass_manager.validate_constraints_mandatory() diff --git a/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py b/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py index 0e99f3f5bfa..fad31b57537 100644 --- a/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py +++ b/backends/arm/test/models/stable_diffusion/test_CLIPTextModelWithProjection.py @@ -4,9 +4,8 @@ # LICENSE file in the root directory of this source tree. -import unittest +from typing import Tuple -import pytest import torch from executorch.backends.arm._passes import ( ConvertInt64ConstOpsToInt32Pass, @@ -18,26 +17,41 @@ from executorch.backends.arm.test.models.stable_diffusion.stable_diffusion_module_test_configs import ( CLIP_text_encoder_config, ) -from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.backends.arm.test.tester.test_pipeline import ( + TosaPipelineFP, + TosaPipelineINT, + VgfPipeline, +) from transformers import CLIPTextModelWithProjection +input_t = Tuple[torch.Tensor] + -class TestCLIPTextModelWithProjection(unittest.TestCase): +class TestCLIPTextModelWithProjection: """ Test class of CLIPTextModelWithProjection. 
CLIPTextModelWithProjection is one of the text_encoder used by Stable Diffusion 3.5 Medium """ - # Adjust nbr below as we increase op support. Note: most of the delegates - # calls are directly consecutive to each other in the .pte. The reason - # for that is some assert ops are removed by passes in the - # .to_executorch step, i.e. after Arm partitioner. - ops_after_partitioner = { + # Adjust nbr below as we increase op support. + ops_after_partitioner_FP = { "executorch_exir_dialects_edge__ops_aten_argmax_default": 1, "executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default": 2, "torch.ops.higher_order.executorch_call_delegate": 2, } + ops_after_partitioner_INT = { + "executorch_exir_dialects_edge__ops_aten_argmax_default": 1, + "executorch_exir_dialects_edge__ops_aten_full_default": 1, + "executorch_exir_dialects_edge__ops_aten_index_select_default": 1, + "executorch_exir_dialects_edge__ops_aten_slice_copy_Tensor": 1, + "executorch_exir_dialects_edge__ops_aten_view_copy_default": 1, + "executorch_exir_dialects_edge__ops_aten_where_self": 1, + "executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default": 2, + "torch.ops.aten.scalar_tensor.default": 1, + "torch.ops.higher_order.executorch_call_delegate": 2, + } + def _prepare_inputs( self, batch_size=12, @@ -61,46 +75,93 @@ def prepare_model_and_inputs(self): return text_encoder_model, text_encoder_model_inputs - def test_CLIPTextModelWithProjection_tosa_FP(self): - text_encoder_model, text_encoder_model_inputs = self.prepare_model_and_inputs() - with torch.no_grad(): - ( - ArmTester( - text_encoder_model, - example_inputs=text_encoder_model_inputs, - compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP"), - transform_passes=[ - ConvertInt64ConstOpsToInt32Pass(), - ConvertInt64OutputOpsToInt32Pass(), - InsertInt32CastsAfterInt64PlaceholdersPass(), - ], - ) - .export() - .to_edge_transform_and_lower() - .dump_operator_distribution() - 
.check_count(self.ops_after_partitioner) - .to_executorch() - .run_method_and_compare_outputs( - inputs=text_encoder_model_inputs, - ) - ) - - @pytest.mark.xfail(raises=AssertionError, reason="Output difference.") - def test_CLIPTextModelWithProjection_tosa_INT(self): - text_encoder_model, text_encoder_model_inputs = self.prepare_model_and_inputs() - with torch.no_grad(): - ( - ArmTester( - text_encoder_model, - example_inputs=text_encoder_model_inputs, - compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+INT"), - ) - .quantize() - .export() - .to_edge_transform_and_lower() - .dump_operator_distribution() - .to_executorch() - .run_method_and_compare_outputs( - inputs=text_encoder_model_inputs, - ) - ) + +def test_CLIPTextModelWithProjection_tosa_FP(): + text_encoder_model, text_encoder_model_inputs = ( + TestCLIPTextModelWithProjection().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = TosaPipelineFP[input_t]( + text_encoder_model, + text_encoder_model_inputs, + aten_op=[], + exir_op=[], + use_to_edge_transform_and_lower=True, + transform_passes=[ + ConvertInt64ConstOpsToInt32Pass(), + ConvertInt64OutputOpsToInt32Pass(), + InsertInt32CastsAfterInt64PlaceholdersPass(), + ], + ) + pipeline.change_args( + "check_count.exir", TestCLIPTextModelWithProjection.ops_after_partitioner_FP + ) + pipeline.run() + + +def test_CLIPTextModelWithProjection_tosa_INT(): + text_encoder_model, text_encoder_model_inputs = ( + TestCLIPTextModelWithProjection().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = TosaPipelineINT[input_t]( + text_encoder_model, + text_encoder_model_inputs, + aten_op=[], + exir_op=[], + use_to_edge_transform_and_lower=True, + atol=0.8, + ) + pipeline.change_args( + "check_count.exir", + TestCLIPTextModelWithProjection.ops_after_partitioner_INT, + ) + pipeline.run() + + +@common.SkipIfNoModelConverter +def test_CLIPTextModelWithProjection_vgf_FP(): + text_encoder_model, text_encoder_model_inputs = ( + 
TestCLIPTextModelWithProjection().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = VgfPipeline[input_t]( + text_encoder_model, + text_encoder_model_inputs, + aten_op=[], + exir_op=[], + tosa_version="TOSA-1.0+FP", + use_to_edge_transform_and_lower=True, + atol=4, # TODO: Investiage numerical issue: MAX Diff ~50% + transform_passes=[ + ConvertInt64ConstOpsToInt32Pass(), + ConvertInt64OutputOpsToInt32Pass(), + InsertInt32CastsAfterInt64PlaceholdersPass(), + ], + ) + pipeline.change_args( + "check_count.exir", TestCLIPTextModelWithProjection.ops_after_partitioner_FP + ) + pipeline.run() + + +@common.SkipIfNoModelConverter +def test_CLIPTextModelWithProjection_vgf_INT(): + text_encoder_model, text_encoder_model_inputs = ( + TestCLIPTextModelWithProjection().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = VgfPipeline[input_t]( + text_encoder_model, + text_encoder_model_inputs, + aten_op=[], + exir_op=[], + tosa_version="TOSA-1.0+INT", + use_to_edge_transform_and_lower=True, + atol=0.8, + ) + pipeline.change_args( + "check_count.exir", + TestCLIPTextModelWithProjection.ops_after_partitioner_INT, + ) + pipeline.run() diff --git a/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py b/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py index f9d814d044b..9506fe727db 100644 --- a/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py +++ b/backends/arm/test/models/stable_diffusion/test_SD3Transformer2DModel.py @@ -4,7 +4,7 @@ # LICENSE file in the root directory of this source tree. 
-import unittest +from typing import Tuple import torch from diffusers.models.transformers import SD3Transformer2DModel @@ -13,10 +13,16 @@ from executorch.backends.arm.test.models.stable_diffusion.stable_diffusion_module_test_configs import ( SD3Transformer2DModel_init_dict, ) -from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.backends.arm.test.tester.test_pipeline import ( + TosaPipelineFP, + TosaPipelineINT, + VgfPipeline, +) + +input_t4 = Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] -class TestSD3Transformer2DModel(unittest.TestCase): +class TestSD3Transformer2DModel: """ Test class of AutoenSD3Transformer2DModelcoderKL. SD3Transformer2DModel is the transformer model used by Stable Diffusion 3.5 Medium @@ -24,16 +30,12 @@ class TestSD3Transformer2DModel(unittest.TestCase): # Adjust nbr below as we increase op support. ops_after_partitioner_FP = { - "executorch_exir_dialects_edge__ops_aten_permute_copy_default": 1, "executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 1, - "executorch_exir_dialects_edge__ops_aten_view_copy_default": 2, "executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default": 1, "torch.ops.higher_order.executorch_call_delegate": 1, } ops_after_partitioner_INT = { - "executorch_exir_dialects_edge__ops_aten_permute_copy_default": 1, - "executorch_exir_dialects_edge__ops_aten_view_copy_default": 2, "executorch_exir_dialects_edge__ops_dim_order_ops__to_dim_order_copy_default": 2, "torch.ops.higher_order.executorch_call_delegate": 2, } @@ -93,48 +95,88 @@ def forward(self, *args, **kwargs): return sd35_transformer2D_model, sd35_transformer2D_model_inputs - def test_SD3Transformer2DModel_tosa_FP(self): - sd35_transformer2D_model, sd35_transformer2D_model_inputs = ( - self.prepare_model_and_inputs() - ) - with torch.no_grad(): - ( - ArmTester( - sd35_transformer2D_model, - example_inputs=sd35_transformer2D_model_inputs, - 
compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP"), - ) - .export() - .to_edge_transform_and_lower() - .check_count(self.ops_after_partitioner_FP) - .to_executorch() - .run_method_and_compare_outputs( - inputs=sd35_transformer2D_model_inputs, - rtol=1.0, # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT - atol=4.0, - ) - ) - def test_SD3Transformer2DModel_tosa_INT(self): - sd35_transformer2D_model, sd35_transformer2D_model_inputs = ( - self.prepare_model_and_inputs() +def test_SD3Transformer2DModel_tosa_FP(): + sd35_transformer2D_model, sd35_transformer2D_model_inputs = ( + TestSD3Transformer2DModel().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = TosaPipelineFP[input_t4]( + sd35_transformer2D_model, + sd35_transformer2D_model_inputs, + aten_op=[], + exir_op=[], + use_to_edge_transform_and_lower=True, + rtol=1.0, # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT + atol=4.0, ) - with torch.no_grad(): - ( - ArmTester( - sd35_transformer2D_model, - example_inputs=sd35_transformer2D_model_inputs, - compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+INT"), - ) - .quantize() - .export() - .to_edge_transform_and_lower() - .check_count(self.ops_after_partitioner_INT) - .to_executorch() - .run_method_and_compare_outputs( - inputs=sd35_transformer2D_model_inputs, - qtol=1.0, # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT - rtol=1.0, - atol=4.0, - ) - ) + pipeline.change_args( + "check_count.exir", TestSD3Transformer2DModel.ops_after_partitioner_FP + ) + pipeline.run() + + +def test_SD3Transformer2DModel_tosa_INT(): + sd35_transformer2D_model, sd35_transformer2D_model_inputs = ( + TestSD3Transformer2DModel().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = TosaPipelineINT[input_t4]( + sd35_transformer2D_model, + sd35_transformer2D_model_inputs, + aten_op=[], + exir_op=[], + use_to_edge_transform_and_lower=True, + 
qtol=1.0, # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT + rtol=1.0, + atol=4.0, + ) + pipeline.change_args( + "check_count.exir", TestSD3Transformer2DModel.ops_after_partitioner_INT + ) + pipeline.run() + + +@common.SkipIfNoModelConverter +def test_SD3Transformer2DModel_vgf_FP(): + sd35_transformer2D_model, sd35_transformer2D_model_inputs = ( + TestSD3Transformer2DModel().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = VgfPipeline[input_t4]( + sd35_transformer2D_model, + sd35_transformer2D_model_inputs, + aten_op=[], + exir_op=[], + tosa_version="TOSA-1.0+FP", + use_to_edge_transform_and_lower=True, + rtol=1.0, # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT + atol=4.0, + ) + pipeline.change_args( + "check_count.exir", TestSD3Transformer2DModel.ops_after_partitioner_FP + ) + pipeline.run() + + +@common.SkipIfNoModelConverter +def test_SD3Transformer2DModel_vgf_INT(): + sd35_transformer2D_model, sd35_transformer2D_model_inputs = ( + TestSD3Transformer2DModel().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = VgfPipeline[input_t4]( + sd35_transformer2D_model, + sd35_transformer2D_model_inputs, + aten_op=[], + exir_op=[], + tosa_version="TOSA-1.0+INT", + use_to_edge_transform_and_lower=True, + qtol=1.0, # TODO: MLETORCH-875: Reduce tolerance of SD3Transformer2DModel with FP and INT + rtol=1.0, + atol=4.0, + ) + pipeline.change_args( + "check_count.exir", TestSD3Transformer2DModel.ops_after_partitioner_INT + ) + pipeline.run() diff --git a/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py b/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py index 22a47042eb1..20b92e4a258 100644 --- a/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py +++ b/backends/arm/test/models/stable_diffusion/test_T5EncoderModel.py @@ -4,7 +4,7 @@ # LICENSE file in the root directory of this source tree. 
-import unittest +from typing import Tuple import torch from executorch.backends.arm._passes import ( @@ -17,11 +17,17 @@ from executorch.backends.arm.test.models.stable_diffusion.stable_diffusion_module_test_configs import ( T5_encoder_config, ) -from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.backends.arm.test.tester.test_pipeline import ( + TosaPipelineFP, + TosaPipelineINT, + VgfPipeline, +) from transformers import T5EncoderModel +input_t = Tuple[torch.Tensor] + -class TestT5EncoderModel(unittest.TestCase): +class TestT5EncoderModel: """ Test class of T5EncoderModel. T5EncoderModel is one of the text_encoder used by Stable Diffusion 3.5 Medium @@ -61,46 +67,88 @@ def prepare_model_and_inputs(self): return t5_encoder_model, t5_encoder_model_inputs - def test_T5EncoderModel_tosa_FP(self): - t5_encoder_model, t5_encoder_model_inputs = self.prepare_model_and_inputs() - with torch.no_grad(): - ( - ArmTester( - t5_encoder_model, - example_inputs=t5_encoder_model_inputs, - compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP"), - transform_passes=[ - ConvertInt64ConstOpsToInt32Pass(), - ConvertInt64OutputOpsToInt32Pass(), - InsertInt32CastsAfterInt64PlaceholdersPass(), - ], - ) - .export() - .to_edge_transform_and_lower() - .dump_operator_distribution() - .check_count(self.ops_after_partitioner_FP) - .to_executorch() - .run_method_and_compare_outputs( - inputs=t5_encoder_model_inputs, - ) - ) - - def test_T5EncoderModel_tosa_INT(self): - t5_encoder_model, t5_encoder_model_inputs = self.prepare_model_and_inputs() - with torch.no_grad(): - ( - ArmTester( - t5_encoder_model, - example_inputs=t5_encoder_model_inputs, - compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+INT"), - ) - .quantize() - .export() - .to_edge_transform_and_lower() - .dump_operator_distribution() - .check_count(self.ops_after_partitioner_INT) - .to_executorch() - .run_method_and_compare_outputs( - inputs=t5_encoder_model_inputs, - ) - ) + 
+def test_T5EncoderModel_tosa_FP(): + t5_encoder_model, t5_encoder_model_inputs = ( + TestT5EncoderModel().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = TosaPipelineFP[input_t]( + t5_encoder_model, + t5_encoder_model_inputs, + aten_op=[], + exir_op=[], + use_to_edge_transform_and_lower=True, + transform_passes=[ + ConvertInt64ConstOpsToInt32Pass(), + ConvertInt64OutputOpsToInt32Pass(), + InsertInt32CastsAfterInt64PlaceholdersPass(), + ], + ) + pipeline.change_args( + "check_count.exir", TestT5EncoderModel.ops_after_partitioner_FP + ) + pipeline.run() + + +def test_T5EncoderModel_tosa_INT(): + t5_encoder_model, t5_encoder_model_inputs = ( + TestT5EncoderModel().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = TosaPipelineINT[input_t]( + t5_encoder_model, + t5_encoder_model_inputs, + aten_op=[], + exir_op=[], + use_to_edge_transform_and_lower=True, + ) + pipeline.change_args( + "check_count.exir", TestT5EncoderModel.ops_after_partitioner_INT + ) + pipeline.run() + + +@common.SkipIfNoModelConverter +def test_T5EncoderModel_vgf_FP(): + t5_encoder_model, t5_encoder_model_inputs = ( + TestT5EncoderModel().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = VgfPipeline[input_t]( + t5_encoder_model, + t5_encoder_model_inputs, + aten_op=[], + exir_op=[], + tosa_version="TOSA-1.0+FP", + use_to_edge_transform_and_lower=True, + transform_passes=[ + ConvertInt64ConstOpsToInt32Pass(), + ConvertInt64OutputOpsToInt32Pass(), + InsertInt32CastsAfterInt64PlaceholdersPass(), + ], + ) + pipeline.change_args( + "check_count.exir", TestT5EncoderModel.ops_after_partitioner_FP + ) + pipeline.run() + + +@common.SkipIfNoModelConverter +def test_T5EncoderModel_vgf_INT(): + t5_encoder_model, t5_encoder_model_inputs = ( + TestT5EncoderModel().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = VgfPipeline[input_t]( + t5_encoder_model, + t5_encoder_model_inputs, + aten_op=[], + exir_op=[], + tosa_version="TOSA-1.0+INT", + 
use_to_edge_transform_and_lower=True, + ) + pipeline.change_args( + "check_count.exir", TestT5EncoderModel.ops_after_partitioner_INT + ) + pipeline.run() diff --git a/backends/arm/test/models/stable_diffusion/test_vae_AutoencoderKL.py b/backends/arm/test/models/stable_diffusion/test_vae_AutoencoderKL.py index ab0f4892fb8..a3c3a018131 100644 --- a/backends/arm/test/models/stable_diffusion/test_vae_AutoencoderKL.py +++ b/backends/arm/test/models/stable_diffusion/test_vae_AutoencoderKL.py @@ -4,7 +4,7 @@ # LICENSE file in the root directory of this source tree. -import unittest +from typing import Tuple import torch from diffusers.models.autoencoders import AutoencoderKL @@ -14,10 +14,16 @@ from executorch.backends.arm.test.models.stable_diffusion.stable_diffusion_module_test_configs import ( AutoencoderKL_config, ) -from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.backends.arm.test.tester.test_pipeline import ( + TosaPipelineFP, + TosaPipelineINT, + VgfPipeline, +) + +input_t = Tuple[torch.Tensor] -class TestAutoencoderKL(unittest.TestCase): +class TestAutoencoderKL: """ Test class of AutoencoderKL. 
AutoencoderKL is the encoder/decoder used by Stable Diffusion 3.5 Medium @@ -41,40 +47,68 @@ def forward(self, *args, **kwargs): return auto_encoder_model, auto_encoder_model_inputs - def test_AutoencoderKL_tosa_FP(self): - auto_encoder_model, auto_encoder_model_inputs = self.prepare_model_and_inputs() - with torch.no_grad(): - ( - ArmTester( - auto_encoder_model, - example_inputs=auto_encoder_model_inputs, - compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP"), - ) - .export() - .to_edge_transform_and_lower() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs( - inputs=auto_encoder_model_inputs, - ) - ) - - def test_AutoencoderKL_tosa_INT(self): - auto_encoder_model, auto_encoder_model_inputs = self.prepare_model_and_inputs() - with torch.no_grad(): - ( - ArmTester( - auto_encoder_model, - example_inputs=auto_encoder_model_inputs, - compile_spec=common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+INT"), - ) - .quantize() - .export() - .to_edge_transform_and_lower() - .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) - .to_executorch() - .run_method_and_compare_outputs( - inputs=auto_encoder_model_inputs, - atol=1.0, # TODO: MLETORCH-990 Reduce tolerance of vae(AutoencoderKL) with INT - ) - ) + +def test_AutoencoderKL_tosa_FP(): + auto_encoder_model, auto_encoder_model_inputs = ( + TestAutoencoderKL().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = TosaPipelineFP[input_t]( + auto_encoder_model, + auto_encoder_model_inputs, + aten_op=[], + exir_op=[], + use_to_edge_transform_and_lower=True, + ) + pipeline.run() + + +def test_AutoencoderKL_tosa_INT(): + auto_encoder_model, auto_encoder_model_inputs = ( + TestAutoencoderKL().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = TosaPipelineINT[input_t]( + auto_encoder_model, + auto_encoder_model_inputs, + aten_op=[], + exir_op=[], + use_to_edge_transform_and_lower=True, + atol=1.0, # 
TODO: MLETORCH-990 Reduce tolerance of vae(AutoencoderKL) with INT + ) + pipeline.run() + + +@common.SkipIfNoModelConverter +def test_AutoencoderKL_vgf_FP(): + auto_encoder_model, auto_encoder_model_inputs = ( + TestAutoencoderKL().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = VgfPipeline[input_t]( + auto_encoder_model, + auto_encoder_model_inputs, + aten_op=[], + exir_op=[], + tosa_version="TOSA-1.0+FP", + use_to_edge_transform_and_lower=True, + ) + pipeline.run() + + +@common.SkipIfNoModelConverter +def test_AutoencoderKL_vgf_INT(): + auto_encoder_model, auto_encoder_model_inputs = ( + TestAutoencoderKL().prepare_model_and_inputs() + ) + with torch.no_grad(): + pipeline = VgfPipeline[input_t]( + auto_encoder_model, + auto_encoder_model_inputs, + aten_op=[], + exir_op=[], + tosa_version="TOSA-1.0+INT", + use_to_edge_transform_and_lower=True, + atol=1.0, # TODO: MLETORCH-990 Reduce tolerance of vae(AutoencoderKL) with INT + ) + pipeline.run() diff --git a/backends/arm/test/models/test_conformer.py b/backends/arm/test/models/test_conformer.py index 3119145aef1..dacf14dc0e7 100644 --- a/backends/arm/test/models/test_conformer.py +++ b/backends/arm/test/models/test_conformer.py @@ -92,7 +92,6 @@ def test_conformer_u55_INT(): aten_ops=TestConformer.aten_ops, exir_ops=[], use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( "run_method_and_compare_outputs", @@ -114,7 +113,6 @@ def test_conformer_u85_INT(): aten_ops=TestConformer.aten_ops, exir_ops=[], use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( "run_method_and_compare_outputs", @@ -136,18 +134,9 @@ def test_conformer_vgf_INT(): exir_op=[], tosa_version="TOSA-1.0+INT", use_to_edge_transform_and_lower=True, + run_on_vulkan_runtime=False, # TODO: run on vulkan runtime ) pipeline.pop_stage("check_count.exir") - - # TODO: MLETORCH-1167 Create Vulkan backend e2e tests - # pipeline.change_args( - # "run_method_and_compare_outputs", - # 
get_test_inputs( - # TestConformer.dim, TestConformer.lengths, TestConformer.num_examples - # ), - # rtol=1.0, - # atol=3.0, - # ) pipeline.run() diff --git a/backends/arm/test/models/test_dl3_arm.py b/backends/arm/test/models/test_dl3_arm.py index 2000ac34794..c9eab58dda6 100644 --- a/backends/arm/test/models/test_dl3_arm.py +++ b/backends/arm/test/models/test_dl3_arm.py @@ -66,7 +66,6 @@ def test_dl3_u55_INT(): TestDl3.model_example_inputs, aten_ops=[], exir_ops=[], - run_on_fvp=True, ) pipeline.change_args( "run_method_and_compare_outputs", rtol=1.0, atol=1.0 @@ -82,7 +81,6 @@ def test_dl3_u85_INT(): TestDl3.model_example_inputs, aten_ops=[], exir_ops=[], - run_on_fvp=True, ) pipeline.change_args( "run_method_and_compare_outputs", rtol=1.0, atol=1.0 @@ -99,11 +97,8 @@ def test_dl3_vgf_INT(): exir_op=[], tosa_version="TOSA-1.0+INT", use_to_edge_transform_and_lower=True, + run_on_vulkan_runtime=False, # TODO: run on vulkan runtime ) - # TODO: MLETORCH-1167 Create Vulkan backend e2e tests - # pipeline.change_args( - # "run_method_and_compare_outputs", rtol=1.0, atol=1.0 - # ) pipeline.run() @@ -117,8 +112,4 @@ def test_dl3_vgf_FP(): tosa_version="TOSA-1.0+FP", use_to_edge_transform_and_lower=True, ) - # TODO: MLETORCH-1167 Create Vulkan backend e2e tests - # pipeline.change_args( - # "run_method_and_compare_outputs", rtol=1.0, atol=1.0 - # ) pipeline.run() diff --git a/backends/arm/test/models/test_inception_v3_arm.py b/backends/arm/test/models/test_inception_v3_arm.py index f973521c1fa..2cb180a87ea 100644 --- a/backends/arm/test/models/test_inception_v3_arm.py +++ b/backends/arm/test/models/test_inception_v3_arm.py @@ -66,7 +66,6 @@ def test_ic3_u55_BI(): model_inputs, aten_ops=[], exir_ops=[], - run_on_fvp=True, use_to_edge_transform_and_lower=True, atol=0.6, qtol=1, @@ -83,7 +82,6 @@ def test_ic3_u85_BI(): model_inputs, aten_ops=[], exir_ops=[], - run_on_fvp=True, use_to_edge_transform_and_lower=True, atol=0.6, qtol=1, diff --git 
a/backends/arm/test/models/test_lstm_arm.py b/backends/arm/test/models/test_lstm_arm.py index 1e63472f5f4..6ee16b6a31a 100644 --- a/backends/arm/test/models/test_lstm_arm.py +++ b/backends/arm/test/models/test_lstm_arm.py @@ -77,7 +77,6 @@ def test_lstm_u55_INT(): aten_ops=[], exir_ops=[], use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( "run_method_and_compare_outputs", get_test_inputs(), atol=3e-1, qtol=1.0 @@ -93,7 +92,6 @@ def test_lstm_u85_INT(): aten_ops=[], exir_ops=[], use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( "run_method_and_compare_outputs", get_test_inputs(), atol=3e-1, qtol=1.0 @@ -111,10 +109,6 @@ def test_lstm_vgf_INT(): tosa_version="TOSA-1.0+INT", use_to_edge_transform_and_lower=True, ) - # TODO: MLETORCH-1167 Create Vulkan backend e2e tests - # pipeline.change_args( - # "run_method_and_compare_outputs", get_test_inputs(), atol=3e-1, qtol=1.0 - # ) pipeline.run() @@ -128,8 +122,4 @@ def test_lstm_vgf_FP(): tosa_version="TOSA-1.0+FP", use_to_edge_transform_and_lower=True, ) - # TODO: MLETORCH-1167 Create Vulkan backend e2e tests - # pipeline.change_args( - # "run_method_and_compare_outputs", get_test_inputs(), atol=3e-1, qtol=1.0 - # ) pipeline.run() diff --git a/backends/arm/test/models/test_mobilenet_v2_arm.py b/backends/arm/test/models/test_mobilenet_v2_arm.py index d4e3bbc8e28..f06e1b74bbd 100644 --- a/backends/arm/test/models/test_mobilenet_v2_arm.py +++ b/backends/arm/test/models/test_mobilenet_v2_arm.py @@ -46,6 +46,23 @@ def test_mv2_tosa_FP(): pipeline.run() +def test_mv2_tosa_FP_channels_last(): + input_tensor = model_inputs[0].to(memory_format=torch.channels_last) + pipeline = TosaPipelineFP[input_t]( + mv2, + (input_tensor,), + aten_op=[], + exir_op=[], + use_to_edge_transform_and_lower=True, + ) + # Changing memory format leads to an unsupported as_strided_copy op being inserted into the graph, + # leading to a graph break. 
+ pipeline.change_args( + "check_count.exir", {"torch.ops.higher_order.executorch_call_delegate": 2} + ) + pipeline.run() + + @common.parametrize("per_channel_quantization", quant_test_data) def test_mv2_tosa_INT(per_channel_quantization): pipeline = TosaPipelineINT[input_t]( @@ -70,7 +87,6 @@ def test_mv2_u55_INT(per_channel_quantization): model_inputs, aten_ops=[], exir_ops=[], - run_on_fvp=True, use_to_edge_transform_and_lower=True, per_channel_quantization=per_channel_quantization, atol=0.25, @@ -88,7 +104,6 @@ def test_mv2_u85_INT(per_channel_quantization): model_inputs, aten_ops=[], exir_ops=[], - run_on_fvp=True, use_to_edge_transform_and_lower=True, per_channel_quantization=per_channel_quantization, atol=0.25, @@ -110,11 +125,8 @@ def test_mv2_vgf_INT(per_channel_quantization): per_channel_quantization=per_channel_quantization, atol=0.25, qtol=1, + run_on_vulkan_runtime=False, ) - # TODO: MLETORCH-1167 Create Vulkan backend e2e tests - # pipeline.change_args( - # "run_method_and_compare_outputs", get_test_inputs(), atol=3e-1, qtol=1.0 - # ) pipeline.run() @@ -127,9 +139,6 @@ def test_mv2_vgf_FP(): exir_op=[], tosa_version="TOSA-1.0+FP", use_to_edge_transform_and_lower=True, + run_on_vulkan_runtime=False, ) - # TODO: MLETORCH-1167 Create Vulkan backend e2e tests - # pipeline.change_args( - # "run_method_and_compare_outputs", get_test_inputs(), atol=3e-1, qtol=1.0 - # ) # TODO: MLETORCH-1036 decrease tolerance pipeline.run() diff --git a/backends/arm/test/models/test_mobilenet_v3_arm.py b/backends/arm/test/models/test_mobilenet_v3_arm.py index 0dcbd9757ac..f3a8f27428b 100644 --- a/backends/arm/test/models/test_mobilenet_v3_arm.py +++ b/backends/arm/test/models/test_mobilenet_v3_arm.py @@ -61,7 +61,6 @@ def test_mv3_u55_INT(): model_inputs, aten_ops=[], exir_ops=[], - run_on_fvp=True, use_to_edge_transform_and_lower=True, atol=0.5, qtol=1, @@ -77,7 +76,6 @@ def test_mv3_u85_INT(): model_inputs, aten_ops=[], exir_ops=[], - run_on_fvp=True, 
use_to_edge_transform_and_lower=True, atol=0.5, qtol=1, diff --git a/backends/arm/test/models/test_resnet18.py b/backends/arm/test/models/test_resnet18.py index 6e965daeb8b..3cb21abd772 100644 --- a/backends/arm/test/models/test_resnet18.py +++ b/backends/arm/test/models/test_resnet18.py @@ -23,7 +23,8 @@ model = model.eval() normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) -model_inputs = (normalize(torch.randn((1, 3, 224, 224))),) +# Using torch.rand * 2 - 1 to generate numbers in the range [-1;1] like an RGB image +model_inputs = (normalize(torch.rand((1, 3, 224, 224)) * 2 - 1),) input_t = Tuple[torch.Tensor] @@ -54,7 +55,7 @@ def test_resnet_tosa_INT(per_channel_quantization): exir_op=[], use_to_edge_transform_and_lower=True, per_channel_quantization=per_channel_quantization, - atol=0.5, + atol=0.25, qtol=1, ) pipeline.run() @@ -69,10 +70,9 @@ def test_resnet_u55_INT(per_channel_quantization): model_inputs, aten_ops=[], exir_ops=[], - run_on_fvp=True, use_to_edge_transform_and_lower=True, per_channel_quantization=per_channel_quantization, - atol=0.5, + atol=0.25, qtol=1, ) pipeline.run() @@ -90,10 +90,9 @@ def test_resnet_u85_INT(per_channel_quantization): model_inputs, aten_ops=[], exir_ops=[], - run_on_fvp=True, use_to_edge_transform_and_lower=True, per_channel_quantization=per_channel_quantization, - atol=0.5, + atol=0.25, qtol=1, ) pipeline.run() diff --git a/backends/arm/test/models/test_torch_functions.py b/backends/arm/test/models/test_torch_functions.py index 580438f6da8..de45dbe0356 100644 --- a/backends/arm/test/models/test_torch_functions.py +++ b/backends/arm/test/models/test_torch_functions.py @@ -101,7 +101,6 @@ def forward(self, *args): "Requires dynamic output shape.", "topk": "NotImplementedError: No registered serialization name for found", "sort": "NotImplementedError: No registered serialization name for found", - "norm": "An error occurred when running the 'KeepDimsFalseToSqueezePass' pass after the 
following passes:", }, ) def test_torch_fns_FP(test_data): diff --git a/backends/arm/test/models/test_w2l_arm.py b/backends/arm/test/models/test_w2l_arm.py index c627cd7f887..d62d92f5fa2 100644 --- a/backends/arm/test/models/test_w2l_arm.py +++ b/backends/arm/test/models/test_w2l_arm.py @@ -91,7 +91,6 @@ def test_w2l_u55_INT(): aten_ops=[], exir_ops=[], use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.run() @@ -106,7 +105,6 @@ def test_w2l_u85_INT(): aten_ops=[], exir_ops=[], use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_abs.py b/backends/arm/test/ops/test_abs.py index 4ebcf7393c1..26495b9df3a 100644 --- a/backends/arm/test/ops/test_abs.py +++ b/backends/arm/test/ops/test_abs.py @@ -55,7 +55,10 @@ def test_abs_tosa_INT(test_data: torch.Tensor): @common.XfailIfNoCorstone300 def test_abs_u55_INT(test_data: torch.Tensor): pipeline = EthosU55PipelineINT[input_t1]( - Abs(), test_data(), aten_op, exir_op, run_on_fvp=True + Abs(), + test_data(), + aten_op, + exir_op, ) pipeline.run() @@ -64,7 +67,10 @@ def test_abs_u55_INT(test_data: torch.Tensor): @common.XfailIfNoCorstone320 def test_abs_u85_INT(test_data: torch.Tensor): pipeline = EthosU85PipelineINT[input_t1]( - Abs(), test_data(), aten_op, exir_op, run_on_fvp=True + Abs(), + test_data(), + aten_op, + exir_op, ) pipeline.run() diff --git a/backends/arm/test/ops/test_acos.py b/backends/arm/test/ops/test_acos.py index 28dadcf95be..f078f46f98e 100644 --- a/backends/arm/test/ops/test_acos.py +++ b/backends/arm/test/ops/test_acos.py @@ -4,7 +4,6 @@ # LICENSE file in the root directory of this source tree. 
from typing import Tuple -import pytest import torch from executorch.backends.arm.test import common @@ -105,10 +104,7 @@ def test_acos_vgf_FP(test_data: Tuple): tosa_version="TOSA-1.0+FP", run_on_vulkan_runtime=True, ) - try: - pipeline.run() - except FileNotFoundError as e: - pytest.skip(f"VKML executor_runner not found - not built - skip {e}") + pipeline.run() @common.parametrize("test_data", test_data_suite) @@ -122,7 +118,4 @@ def test_acos_vgf_INT(test_data: Tuple): tosa_version="TOSA-1.0+INT", run_on_vulkan_runtime=True, ) - try: - pipeline.run() - except FileNotFoundError as e: - pytest.skip(f"VKML executor_runner not found - not built - skip {e}") + pipeline.run() diff --git a/backends/arm/test/ops/test_acosh.py b/backends/arm/test/ops/test_acosh.py index 25ba2b1a83b..db0bd1c3281 100644 --- a/backends/arm/test/ops/test_acosh.py +++ b/backends/arm/test/ops/test_acosh.py @@ -87,7 +87,6 @@ def test_acosh_u55_INT_xfail(test_data: Tuple): Acosh(), (test_data(),), aten_ops=[], - run_on_fvp=False, ) pipeline.run() @@ -110,7 +109,6 @@ def test_acosh_u85_INT_xfail(test_data: Tuple): Acosh(), (test_data(),), aten_ops=[], - run_on_fvp=False, ) pipeline.run() diff --git a/backends/arm/test/ops/test_add.py b/backends/arm/test/ops/test_add.py index 24fdfbb5457..09c9d8fa224 100644 --- a/backends/arm/test/ops/test_add.py +++ b/backends/arm/test/ops/test_add.py @@ -7,7 +7,6 @@ from typing import cast, Tuple -import pytest import torch from executorch.backends.arm.quantizer import arm_quantizer from executorch.backends.arm.quantizer.arm_quantizer import ( @@ -78,7 +77,7 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): class Add3(torch.nn.Module): def forward(self, x: torch.Tensor, y: torch.Tensor): - return x + y + return torch.add(x, y, alpha=1.5) test_data: list[input_t2] = { "3d_randn_diff_rank": lambda: (torch.randn(1, 4, 5), torch.randn(4, 1)), @@ -144,7 +143,10 @@ def test_add_tensor_tosa_INT_i32(test_data: input_t1): @common.XfailIfNoCorstone300 def 
test_add_tensor_u55_INT(test_data: input_t1): pipeline = EthosU55PipelineINT[input_t1]( - Add(), test_data(), aten_op, exir_op, run_on_fvp=True + Add(), + test_data(), + aten_op, + exir_op, ) pipeline.run() @@ -153,7 +155,10 @@ def test_add_tensor_u55_INT(test_data: input_t1): @common.XfailIfNoCorstone320 def test_add_tensor_u85_INT(test_data: input_t1): pipeline = EthosU85PipelineINT[input_t1]( - Add(), test_data(), aten_op, exir_op, run_on_fvp=True + Add(), + test_data(), + aten_op, + exir_op, ) pipeline.run() @@ -186,7 +191,10 @@ def test_add_tensor_tosa_INT_2(test_data: input_t2): @common.XfailIfNoCorstone300 def test_add_tensor_u55_INT_2(test_data: input_t2): pipeline = EthosU55PipelineINT[input_t2]( - Add2(), test_data(), aten_op, exir_op, run_on_fvp=True + Add2(), + test_data(), + aten_op, + exir_op, ) pipeline.run() @@ -195,7 +203,10 @@ def test_add_tensor_u55_INT_2(test_data: input_t2): @common.XfailIfNoCorstone320 def test_add_tensor_u85_INT_2(test_data: input_t2): pipeline = EthosU85PipelineINT[input_t2]( - Add2(), test_data(), aten_op, exir_op, run_on_fvp=True + Add2(), + test_data(), + aten_op, + exir_op, ) pipeline.run() @@ -211,10 +222,7 @@ def test_add_tensor_vgf_FP(test_data: input_t1): tosa_version="TOSA-1.0+FP", run_on_vulkan_runtime=True, ) - try: - pipeline.run() - except FileNotFoundError as e: - pytest.skip(f"VKML executor_runner not found - not built - skip {e}") + pipeline.run() @common.parametrize("test_data", Add.test_data) @@ -228,10 +236,7 @@ def test_add_tensor_vgf_INT(test_data: input_t1): tosa_version="TOSA-1.0+INT", run_on_vulkan_runtime=True, ) - try: - pipeline.run() - except FileNotFoundError as e: - pytest.skip(f"VKML executor_runner not found - not built - skip {e}") + pipeline.run() def get_symmetric_a16w8_add_quantizer(per_channel_quantization=False): @@ -254,9 +259,6 @@ def get_symmetric_a16w8_add_quantizer(per_channel_quantization=False): @common.parametrize("test_data", Add.test_data) -@pytest.mark.xfail( - reason="missing 
int16 add ops support; fails at TOSA reference model with Unsupported operation type or rank. See: https://github.com/pytorch/executorch/issues/13730" -) def test_add_tensor_16a8w_tosa_INT(test_data: input_t1): """Test add operation with 16A8W quantization (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -282,9 +284,6 @@ def test_add_tensor_16a8w_tosa_INT(test_data: input_t1): @common.parametrize("test_data", Add.test_data) @common.XfailIfNoCorstone300 -@pytest.mark.xfail( - reason="Vela compilation fails with 'Invalid arguments' for int16 add operations. See: https://github.com/pytorch/executorch/issues/13730" -) def test_add_tensor_16a8w_u55_INT16(test_data: input_t1): """Test add operation with 16A8W quantization on U55 (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -296,7 +295,6 @@ def test_add_tensor_16a8w_u55_INT16(test_data: input_t1): exir_op, per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( @@ -310,9 +308,6 @@ def test_add_tensor_16a8w_u55_INT16(test_data: input_t1): @common.parametrize("test_data", Add.test_data) @common.XfailIfNoCorstone320 -@pytest.mark.xfail( - reason="Vela compilation fails with 'Invalid arguments' for int16 add operations. 
See: https://github.com/pytorch/executorch/issues/13730" -) def test_add_tensor_16a8w_u85_INT16(test_data: input_t1): """Test add operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -324,7 +319,6 @@ def test_add_tensor_16a8w_u85_INT16(test_data: input_t1): exir_op, per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( diff --git a/backends/arm/test/ops/test_addmm.py b/backends/arm/test/ops/test_addmm.py index 753cb599b2b..685b69b3541 100644 --- a/backends/arm/test/ops/test_addmm.py +++ b/backends/arm/test/ops/test_addmm.py @@ -211,9 +211,6 @@ def get_symmetric_a16w8_addmm_quantizer(per_channel_quantization=False): @common.parametrize("test_data", test_data_suite) -@pytest.mark.xfail( - reason="missing int16 addmm ops support; fails at TOSA reference model with Unsupported operation type or rank. See: https://github.com/pytorch/executorch/issues/13979" -) def test_addmm_16a8w_tosa_INT(test_data: input_t1): """Test addmm (FC layer) operation with 16A8W quantization (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -253,7 +250,6 @@ def test_addmm_16a8w_u55_INT16(test_data: input_t1): exir_ops=[], per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( @@ -267,9 +263,6 @@ def test_addmm_16a8w_u55_INT16(test_data: input_t1): @common.parametrize("test_data", test_data_suite) @common.XfailIfNoCorstone320 -@pytest.mark.xfail( - reason="Vela compilation fails with 'Invalid arguments' for int16 addmm operations" -) def test_addmm_16a8w_u85_INT16(test_data: input_t1): """Test addmm (FC layer) operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -281,7 +274,6 @@ def test_addmm_16a8w_u85_INT16(test_data: input_t1): exir_ops=[], per_channel_quantization=per_channel_quantization, 
use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( diff --git a/backends/arm/test/ops/test_amax.py b/backends/arm/test/ops/test_amax.py index 080dddda92e..e69e9163325 100644 --- a/backends/arm/test/ops/test_amax.py +++ b/backends/arm/test/ops/test_amax.py @@ -103,7 +103,6 @@ def test_amax_u85_INT(test_data: Amax.input_t): Amax(dim, keep_dims), data, Amax.aten_op, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_amin.py b/backends/arm/test/ops/test_amin.py index a24da9e1ba0..09d9018c73e 100644 --- a/backends/arm/test/ops/test_amin.py +++ b/backends/arm/test/ops/test_amin.py @@ -29,12 +29,16 @@ def __init__(self, dim, keep_dims): super().__init__() def forward(self, x): - return torch.amin(x, self.dim, self.keep_dims) + if self.dim is None: + return torch.amin(x, keepdim=self.keep_dims) + else: + return torch.amin(x, self.dim, self.keep_dims) - test_data: Dict[str, input_t] = { + test_data: Dict = { "rank_1_dim_0": lambda: ((torch.rand([10]),), 0, False), "rank_2_dim_1_keep_dims": lambda: ((torch.rand([2, 2]),), (1,), True), "rank_4_all_dim": lambda: ((torch.rand([1, 2, 5, 5]),), (0, 1, 2, 3), False), + "rank_4_no_dim": lambda: ((torch.rand([1, 2, 5, 5]),), None, False), "rank_4_0,3_keep_dims": lambda: ((torch.rand([1, 2, 2, 2]),), (0, 3), True), "rank_4_mult_batches": lambda: ((torch.rand([2, 2, 2, 2]),), (0), True), } @@ -52,7 +56,7 @@ def forward(self, x): x = torch.min(x, self.dim) return x[0] - test_data: Dict[str, input_t] = { + test_data: Dict = { "rank_1_dim_0": lambda: ((torch.rand([10]),), 0), "rank_2_dim_1": lambda: ((torch.rand([2, 2]),), 1), "rank_4_dim_2": lambda: ((torch.rand([2, 2, 2, 2]),), 2), @@ -112,7 +116,6 @@ def test_amin_u85_INT(test_data: Amin.input_t): Amin(dim, keep_dims), data, Amin.aten_op, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_any.py b/backends/arm/test/ops/test_any.py index ae738480048..3eccff0a64e 100644 --- 
a/backends/arm/test/ops/test_any.py +++ b/backends/arm/test/ops/test_any.py @@ -177,7 +177,6 @@ def test_any_u85_INT(test_data: input_t1): test_input(), op.aten_op, op.exir_op, - run_on_fvp=True, atol=0, rtol=0, qtol=0, diff --git a/backends/arm/test/ops/test_avg_pool2d.py b/backends/arm/test/ops/test_avg_pool2d.py index be54c76e68b..8310d1e40a4 100644 --- a/backends/arm/test/ops/test_avg_pool2d.py +++ b/backends/arm/test/ops/test_avg_pool2d.py @@ -151,7 +151,6 @@ def test_avg_pool2d_u55_INT(test_module): input_tensor, aten_op, exir_op, - run_on_fvp=True, ) pipeline.run() @@ -166,7 +165,6 @@ def test_avg_pool2d_u85_INT(test_module): input_tensor, aten_op, exir_op, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_batch_norm.py b/backends/arm/test/ops/test_batch_norm.py index a28180b7b57..fc5e11645dd 100644 --- a/backends/arm/test/ops/test_batch_norm.py +++ b/backends/arm/test/ops/test_batch_norm.py @@ -220,7 +220,6 @@ def test_native_batch_norm_legit_no_training_u55_INT_conv(test_data: Tuple): BatchNorm2dConv(*model_params), (test_data,), aten_ops=BatchNorm2dConv.aten_ops[0], # Bn is removed before check - run_on_fvp=True, qtol=1, ) pipeline.run() @@ -234,7 +233,6 @@ def test_native_batch_norm_legit_no_training_u85_INT_conv(test_data: Tuple): BatchNorm2dConv(*model_params), (test_data,), aten_ops=BatchNorm2dConv.aten_ops[0], # Bn is removed before check - run_on_fvp=True, qtol=1, ) pipeline.run() @@ -336,7 +334,6 @@ def test_native_batch_norm_legit_no_stats_u55_INT(test_data: Tuple): BatchNorm2dNoStats(*model_params), (test_data,), aten_op=BatchNorm2dNoStats.aten_ops, - run_on_fvp=True, qtol=1, ) pipeline.run() @@ -353,7 +350,6 @@ def test_native_batch_norm_legit_no_stats_u85_INT(test_data: Tuple): BatchNorm2dNoStats(*model_params), (test_data,), aten_op=BatchNorm2dNoStats.aten_ops, - run_on_fvp=False, qtol=1, ) pipeline.run() diff --git a/backends/arm/test/ops/test_bitwise.py b/backends/arm/test/ops/test_bitwise.py index 
218f2290cab..f9b20e5dbdd 100644 --- a/backends/arm/test/ops/test_bitwise.py +++ b/backends/arm/test/ops/test_bitwise.py @@ -235,7 +235,6 @@ def test_bitwise_and_scalar_u85_INT(test_data: input_t2): test_data(), AndScalar.aten_op, AndScalar.exir_op, - run_on_fvp=True, atol=0, rtol=0, qtol=0, @@ -253,7 +252,6 @@ def test_bitwise_and_tensor_u85_INT(test_data: input_t2): test_data(), And().aten_op, And().exir_op, - run_on_fvp=True, atol=0, rtol=0, qtol=0, @@ -418,7 +416,6 @@ def test_bitwise_xor_tensor_u85_INT(test_data: input_t2): test_data(), Xor().aten_op, Xor().exir_op, - run_on_fvp=True, atol=0, rtol=0, qtol=0, @@ -436,7 +433,6 @@ def test_bitwise_xor_scalar_u85_INT(test_data: input_t2): test_data(), XorScalar.aten_op, XorScalar.exir_op, - run_on_fvp=True, atol=0, rtol=0, qtol=0, @@ -601,7 +597,6 @@ def test_bitwise_or_tensor_u85_INT(test_data: input_t2): test_data(), Or().aten_op, Or().exir_op, - run_on_fvp=True, atol=0, rtol=0, qtol=0, @@ -619,7 +614,6 @@ def test_bitwise_or_scalar_u85_INT(test_data: input_t2): test_data(), OrScalar.aten_op, OrScalar.exir_op, - run_on_fvp=True, atol=0, rtol=0, qtol=0, diff --git a/backends/arm/test/ops/test_bitwise_not.py b/backends/arm/test/ops/test_bitwise_not.py new file mode 100644 index 00000000000..4f48bc134ba --- /dev/null +++ b/backends/arm/test/ops/test_bitwise_not.py @@ -0,0 +1,120 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Tuple + +import torch + +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU85PipelineINT, + OpNotSupportedPipeline, + TosaPipelineINT, + VgfPipeline, +) + +aten_op = "torch.ops.aten.bitwise_not.default" +exir_op = "executorch_exir_dialects_edge__ops_aten_bitwise_not_default" + +input_t1 = Tuple[torch.Tensor] + +test_data_suite = { + "zeros": torch.zeros(1, 10, 10, 10, dtype=torch.int32), + "ones": torch.ones(10, 2, 3, dtype=torch.int8), + "pattern1_int8": 0xAA * torch.ones(1, 2, 2, 2, dtype=torch.int8), + "pattern1_int16": 0xAAAA * torch.ones(1, 2, 2, 2, dtype=torch.int16), + "pattern1_int32": 0xAAAAAAAA * torch.ones(1, 2, 2, 2, dtype=torch.int32), + "pattern2_int8": 0xCC * torch.ones(1, 2, 2, 2, dtype=torch.int8), + "pattern2_int16": 0xCCCC * torch.ones(1, 2, 2, 2, dtype=torch.int16), + "pattern2_int32": 0xCCCCCCCC * torch.ones(1, 2, 2, 2, dtype=torch.int32), + "rand_rank2": torch.randint(-128, 127, (10, 10), dtype=torch.int8), + "rand_rank4": torch.randint(-128, 127, (1, 10, 10, 10), dtype=torch.int8), +} + + +class BitwiseNot(torch.nn.Module): + + def forward(self, x: torch.Tensor): + return torch.bitwise_not(x) + + +@common.parametrize("test_data", test_data_suite) +def test_bitwise_not_tosa_FP(test_data: Tuple): + # We don't delegate bitwise_not since it is not supported on the FP profile. 
+ pipeline = OpNotSupportedPipeline[input_t1]( + BitwiseNot(), + (test_data,), + {exir_op: 1}, + quantize=False, + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_bitwise_not_tosa_INT(test_data: Tuple): + pipeline = TosaPipelineINT[input_t1]( + BitwiseNot(), + (test_data,), + aten_op=aten_op, + exir_op=exir_op, + ) + pipeline.pop_stage("quantize") + pipeline.pop_stage("check.quant_nodes") + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_bitwise_not_u55_INT(test_data: Tuple): + # We don't delegate bitwise_not since it is not supported on U55. + pipeline = OpNotSupportedPipeline[input_t1]( + BitwiseNot(), + (test_data,), + {exir_op: 1}, + quantize=True, + u55_subset=True, + ) + pipeline.run() + + +@common.XfailIfNoCorstone320 +@common.parametrize("test_data", test_data_suite) +def test_bitwise_not_u85_INT(test_data: Tuple): + pipeline = EthosU85PipelineINT[input_t1]( + BitwiseNot(), + (test_data,), + aten_ops=aten_op, + exir_ops=exir_op, + ) + pipeline.pop_stage("quantize") + pipeline.pop_stage("check.quant_nodes") + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@common.SkipIfNoModelConverter +def test_bitwise_not_vgf_FP(test_data: Tuple): + # We don't delegate bitwise_not since it is not supported on the FP profile. 
+ pipeline = OpNotSupportedPipeline[input_t1]( + BitwiseNot(), + (test_data,), + {exir_op: 1}, + quantize=False, + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@common.SkipIfNoModelConverter +def test_bitwise_not_vgf_INT(test_data: Tuple): + pipeline = VgfPipeline[input_t1]( + BitwiseNot(), + (test_data,), + aten_op, + exir_op, + tosa_version="TOSA-1.0+INT", + ) + pipeline.pop_stage("quantize") + pipeline.pop_stage("check.quant_nodes") + pipeline.run() diff --git a/backends/arm/test/ops/test_bmm.py b/backends/arm/test/ops/test_bmm.py index 7c0fc1665bb..f69b1419c8d 100644 --- a/backends/arm/test/ops/test_bmm.py +++ b/backends/arm/test/ops/test_bmm.py @@ -97,7 +97,6 @@ def test_bmm_u55_INT(test_data: input_t1): test_data(), aten_op_bmm, exir_op_bmm, - run_on_fvp=True, ) pipeline.run() @@ -110,7 +109,6 @@ def test_bmm_u85_INT(test_data: input_t1): test_data(), aten_op_bmm, exir_op_bmm, - run_on_fvp=True, ) pipeline.run() @@ -123,7 +121,6 @@ def test_bmm_u55_INT_single_input(test_data: input_t1): test_data(), aten_op_bmm, exir_op_bmm, - run_on_fvp=True, ) pipeline.run() @@ -136,7 +133,6 @@ def test_bmm_u85_INT_single_input(test_data: input_t1): test_data(), aten_op_bmm, exir_op_bmm, - run_on_fvp=True, ) pipeline.run() @@ -150,7 +146,11 @@ def test_bmm_vgf_FP(test_data: input_t1): pipeline.run() -@common.parametrize("test_data", BMMSingleInput.test_data_generators) +@common.parametrize( + "test_data", + BMMSingleInput.test_data_generators, + flakies={"rand_big_1": 3}, +) @common.SkipIfNoModelConverter def test_bmm_vgf_FP_single_input(test_data: input_t1): pipeline = VgfPipeline[input_t1]( @@ -186,6 +186,4 @@ def test_bmm_vgf_INT_single_input(test_data: input_t1): exir_op_bmm, tosa_version="TOSA-1.0+INT", ) - # TODO: MLETORCH-1136 Change args of run_method_and_compare_outputs of the vgf tests - # pipeline.change_args("run_method_and_compare_outputs", qtol=1) pipeline.run() diff --git a/backends/arm/test/ops/test_cat.py 
b/backends/arm/test/ops/test_cat.py index 84ecd8641b5..254edbc411f 100644 --- a/backends/arm/test/ops/test_cat.py +++ b/backends/arm/test/ops/test_cat.py @@ -8,7 +8,6 @@ from typing import Tuple -import pytest import torch from executorch.backends.arm.quantizer.arm_quantizer import ( get_symmetric_a16w8_quantization_config, @@ -120,7 +119,6 @@ def test_cat_u55_INT(test_data: Tuple): test_data(), aten_op, exir_op, - run_on_fvp=True, ) pipeline.run() @@ -133,7 +131,6 @@ def test_cat_u85_INT(test_data: Tuple): test_data(), aten_op, exir_op, - run_on_fvp=True, ) pipeline.run() @@ -180,9 +177,6 @@ def get_symmetric_a16w8_cat_quantizer(per_channel_quantization=False): @common.parametrize("test_data", Cat.test_parameters) -@pytest.mark.xfail( - reason="missing int16 cat ops support; fails at TOSA reference model with Unsupported operation type or rank. See: https://github.com/pytorch/executorch/issues/13978" -) def test_cat_16a8w_tosa_INT(test_data: Tuple): """Test cat operation with 16A8W quantization (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -208,9 +202,6 @@ def test_cat_16a8w_tosa_INT(test_data: Tuple): @common.parametrize("test_data", Cat.test_parameters) @common.XfailIfNoCorstone300 -@pytest.mark.xfail( - reason="Vela compilation fails with 'Invalid arguments' for int16 cat operations" -) def test_cat_16a8w_u55_INT16(test_data: Tuple): """Test cat operation with 16A8W quantization on U55 (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -222,7 +213,6 @@ def test_cat_16a8w_u55_INT16(test_data: Tuple): exir_op, per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( @@ -236,9 +226,6 @@ def test_cat_16a8w_u55_INT16(test_data: Tuple): @common.parametrize("test_data", Cat.test_parameters) @common.XfailIfNoCorstone320 -@pytest.mark.xfail( - reason="Vela compilation fails with 'Invalid arguments' for int16 cat operations" -) def 
test_cat_16a8w_u85_INT16(test_data: Tuple): """Test cat operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -250,7 +237,6 @@ def test_cat_16a8w_u85_INT16(test_data: Tuple): exir_op, per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( diff --git a/backends/arm/test/ops/test_ceil.py b/backends/arm/test/ops/test_ceil.py index 64e9040a974..ed304bbd9df 100644 --- a/backends/arm/test/ops/test_ceil.py +++ b/backends/arm/test/ops/test_ceil.py @@ -78,7 +78,6 @@ def test_ceil_u55_INT(test_data: input_t1): (data,), module.aten_op, module.exir_op, - run_on_fvp=True, ) pipeline.run() @@ -92,7 +91,6 @@ def test_ceil_u85_INT(test_data: input_t1): (data,), module.aten_op, module.exir_op, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_clamp.py b/backends/arm/test/ops/test_clamp.py index ba490ccc0c6..a5561802e44 100644 --- a/backends/arm/test/ops/test_clamp.py +++ b/backends/arm/test/ops/test_clamp.py @@ -96,7 +96,6 @@ def test_clamp_u55_INT(test_data): (input_tensor,), aten_op, exir_op, - run_on_fvp=True, ) pipeline.change_args("run_method_and_compare_outputs", qtol=1) @@ -115,7 +114,6 @@ def test_clamp_u85_INT(test_data): (input_tensor,), aten_op, exir_op, - run_on_fvp=True, ) pipeline.change_args("run_method_and_compare_outputs", qtol=1) @@ -149,6 +147,4 @@ def test_clamp_vgf_INT(test_data): exir_op, tosa_version="TOSA-1.0+INT", ) - # TODO: MLETORCH-1136 Change args of run_method_and_compare_outputs of the vgf tests - # pipeline.change_args("run_method_and_compare_outputs", qtol=1) pipeline.run() diff --git a/backends/arm/test/ops/test_clone.py b/backends/arm/test/ops/test_clone.py index b240fb1ea07..8a6d3714b8b 100644 --- a/backends/arm/test/ops/test_clone.py +++ b/backends/arm/test/ops/test_clone.py @@ -102,7 +102,6 @@ def test_clone_u55_INT(input_data): input_tensor, aten_op, exir_op, - run_on_fvp=True, ) 
pipeline.run() @@ -118,7 +117,6 @@ def test_clone_u85_INT(input_data): input_tensor, aten_op, exir_op, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_conv1d.py b/backends/arm/test/ops/test_conv1d.py index ac66bc1556b..d58cdb5ff61 100644 --- a/backends/arm/test/ops/test_conv1d.py +++ b/backends/arm/test/ops/test_conv1d.py @@ -307,7 +307,6 @@ def test_convolution_1d_u55_INT(test_data): model.get_inputs(), aten_op, exir_op, - run_on_fvp=True, per_channel_quantization=per_channel_quantization, qtol=1, ) @@ -323,7 +322,6 @@ def test_convolution_1d_u85_INT(test_data): model.get_inputs(), aten_op, exir_op, - run_on_fvp=True, per_channel_quantization=per_channel_quantization, qtol=1, ) diff --git a/backends/arm/test/ops/test_conv2d.py b/backends/arm/test/ops/test_conv2d.py index 0300f7c2049..bf47e3fa084 100644 --- a/backends/arm/test/ops/test_conv2d.py +++ b/backends/arm/test/ops/test_conv2d.py @@ -426,7 +426,6 @@ def test_convolution_2d_u55_INT(test_data): model.get_inputs(), aten_op, exir_op, - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() @@ -441,7 +440,6 @@ def test_convolution_u85_INT(test_data): model.get_inputs(), aten_op, exir_op, - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() diff --git a/backends/arm/test/ops/test_conv3d.py b/backends/arm/test/ops/test_conv3d.py index b26f75daa1a..46986103aa0 100644 --- a/backends/arm/test/ops/test_conv3d.py +++ b/backends/arm/test/ops/test_conv3d.py @@ -367,7 +367,6 @@ def test_convolution_3d_u55_INT(test_data): model.get_inputs(), aten_op, exir_op, - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() @@ -382,7 +381,6 @@ def test_convolution_3d_u85_INT(test_data): model.get_inputs(), aten_op, exir_op, - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() diff --git a/backends/arm/test/ops/test_conv_combos.py b/backends/arm/test/ops/test_conv_combos.py index 
a7a031468ea..f0f8b404594 100644 --- a/backends/arm/test/ops/test_conv_combos.py +++ b/backends/arm/test/ops/test_conv_combos.py @@ -258,7 +258,6 @@ def test_convolution_2d_u55_INT_meandim(): model.get_inputs(), aten_ops=[], exir_ops=ComboConv2dMeandim.edge_op_list, - run_on_fvp=True, ) pipeline.run() @@ -271,7 +270,6 @@ def test_convolution_2d_u85_INT_meandim(): model.get_inputs(), aten_ops=[], exir_ops=ComboConv2dMeandim.edge_op_list, - run_on_fvp=True, ) pipeline.run() @@ -346,7 +344,6 @@ def test_convolution_2d_u55_INT_batchnorm_relu6(test_data): model.get_inputs(), aten_ops=[], exir_ops=[], - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() @@ -362,7 +359,6 @@ def test_convolution_2d_u85_INT_batchnorm_relu6(test_data): model.get_inputs(), aten_ops=[], exir_ops=[], - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() @@ -441,7 +437,6 @@ def test_convolution_2d_u55_INT_relu6(test_data): input, aten_ops=[], exir_ops=ComboConvRelu6.edge_op_list, - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() @@ -457,7 +452,6 @@ def test_convolution_2d_u85_INT_relu6(test_data): input, aten_ops=[], exir_ops=ComboConvRelu6.edge_op_list, - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() @@ -533,7 +527,6 @@ def test_convolution_2d_u55_INT_block_bottleneck(test_data): model.get_inputs(), aten_ops=[], exir_ops=[], - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() @@ -549,7 +542,6 @@ def test_convolution_2d_u85_INT_block_bottleneck(test_data): model.get_inputs(), aten_ops=[], exir_ops=[], - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() @@ -581,8 +573,6 @@ def test_convolution_2d_vgf_INT_block_bottleneck(test_data): tosa_version="TOSA-1.0+INT", per_channel_quantization=per_channel_quantization, ) - # TODO: MLETORCH-1136 Change args of run_method_and_compare_outputs of the 
vgf tests - # pipeline.change_args("run_method_and_compare_outputs", model.get_inputs(), qtol=1) pipeline.run() @@ -628,7 +618,6 @@ def test_convolution_2d_u55_INT_avgpool2d(test_data): input, aten_ops=[], exir_ops=[], - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() @@ -644,7 +633,6 @@ def test_convolution_2d_u85_INT_avgpool2d(test_data): input, aten_ops=[], exir_ops=[], - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() diff --git a/backends/arm/test/ops/test_cos.py b/backends/arm/test/ops/test_cos.py index acb950f2a2e..b0c35bf7878 100644 --- a/backends/arm/test/ops/test_cos.py +++ b/backends/arm/test/ops/test_cos.py @@ -66,25 +66,25 @@ def test_cos_tosa_INT(test_data: Tuple): @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone300 def test_cos_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( Cos(), (test_data,), aten_op, exir_ops=[], - run_on_fvp=False, ) pipeline.run() @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone320 def test_cos_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( Cos(), (test_data,), aten_op, exir_ops=[], - run_on_fvp=False, ) pipeline.run() diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py index 0f8b34d3d47..e49ab236d86 100644 --- a/backends/arm/test/ops/test_depthwise_conv.py +++ b/backends/arm/test/ops/test_depthwise_conv.py @@ -260,7 +260,6 @@ def test_convolution_2d_u55_INT_depthwise(test_data): model.get_inputs(), aten_ops=[], exir_ops=exir_op, - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() @@ -275,7 +274,6 @@ def test_convolution_1d_u55_INT_depthwise(test_data): model.get_inputs(), aten_ops=[], exir_ops=exir_op, - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() @@ -290,7 +288,6 @@ def test_convolution_2d_u85_INT_depthwise(test_data): 
model.get_inputs(), aten_ops=[], exir_ops=exir_op, - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() @@ -305,7 +302,6 @@ def test_convolution_1d_u85_INT_depthwise(test_data): model.get_inputs(), aten_ops=[], exir_ops=exir_op, - run_on_fvp=True, per_channel_quantization=per_channel_quantization, ) pipeline.run() diff --git a/backends/arm/test/ops/test_div.py b/backends/arm/test/ops/test_div.py index 5bacac1c962..612622b46cb 100644 --- a/backends/arm/test/ops/test_div.py +++ b/backends/arm/test/ops/test_div.py @@ -109,7 +109,6 @@ def test_div_tensor_u55_INT(test_data: Tuple): test_data(), aten_ops=[], exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -122,7 +121,6 @@ def test_div_tensor_u85_INT(test_data: Tuple): test_data(), aten_ops=[], exir_ops=[], - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_div_tensor_mode.py b/backends/arm/test/ops/test_div_tensor_mode.py index 909b83bd97f..e1f6036a487 100644 --- a/backends/arm/test/ops/test_div_tensor_mode.py +++ b/backends/arm/test/ops/test_div_tensor_mode.py @@ -96,7 +96,6 @@ def test_div_tensor_mode_u55_INT(data): aten_ops=model.aten_ops_int, exir_ops=[], use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.run() @@ -113,7 +112,6 @@ def test_div_tensor_mode_u85_INT(data): aten_ops=model.aten_ops_int, exir_ops=[], use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_eq.py b/backends/arm/test/ops/test_eq.py index b840869ba48..8f783240a2c 100644 --- a/backends/arm/test/ops/test_eq.py +++ b/backends/arm/test/ops/test_eq.py @@ -165,7 +165,6 @@ def test_eq_scalar_u85_INT_tensor(test_module): test_module().get_inputs(), Equal.aten_op_Tensor, Equal.exir_op, - run_on_fvp=True, ) pipeline.run() @@ -185,7 +184,6 @@ def test_eq_scalar_u85_INT(test_module): test_module().get_inputs(), Equal.aten_op_Tensor, Equal.exir_op, - run_on_fvp=True, ) pipeline.run() diff --git 
a/backends/arm/test/ops/test_erf.py b/backends/arm/test/ops/test_erf.py index 363b1e2d8c9..e6b28255d6b 100644 --- a/backends/arm/test/ops/test_erf.py +++ b/backends/arm/test/ops/test_erf.py @@ -50,7 +50,10 @@ def test_erf_tosa_INT(test_data: input_t1): @common.XfailIfNoCorstone300 def test_erf_u55_INT(test_data: input_t1): pipeline = EthosU55PipelineINT[input_t1]( - Erf(), test_data(), aten_op, exir_op, run_on_fvp=True + Erf(), + test_data(), + aten_op, + exir_op, ) pipeline.run() @@ -59,7 +62,10 @@ def test_erf_u55_INT(test_data: input_t1): @common.XfailIfNoCorstone320 def test_erf_u85_INT(test_data: input_t1): pipeline = EthosU85PipelineINT[input_t1]( - Erf(), test_data(), aten_op, exir_op, run_on_fvp=True + Erf(), + test_data(), + aten_op, + exir_op, ) pipeline.run() diff --git a/backends/arm/test/ops/test_exp.py b/backends/arm/test/ops/test_exp.py index 6eaacc71d86..56d258944c2 100644 --- a/backends/arm/test/ops/test_exp.py +++ b/backends/arm/test/ops/test_exp.py @@ -68,7 +68,6 @@ def test_exp_u55_INT(test_data: Tuple): (test_data(),), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -81,7 +80,6 @@ def test_exp_u85_INT(test_data: Tuple): (test_data(),), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_expand.py b/backends/arm/test/ops/test_expand.py index b5784c9ff93..34694469bbf 100644 --- a/backends/arm/test/ops/test_expand.py +++ b/backends/arm/test/ops/test_expand.py @@ -79,7 +79,6 @@ def test_expand_u55_INT(test_data: Tuple): test_data(), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -92,7 +91,6 @@ def test_expand_u85_INT(test_data: Tuple): test_data(), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_fill_scalar.py b/backends/arm/test/ops/test_fill_scalar.py new file mode 100644 index 00000000000..fb84d993575 --- /dev/null +++ b/backends/arm/test/ops/test_fill_scalar.py @@ -0,0 +1,108 @@ +# Copyright 2025 Arm Limited and/or its 
affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Tuple + +import torch +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineINT, + EthosU85PipelineINT, + TosaPipelineFP, + TosaPipelineINT, + VgfPipeline, +) + +aten_op = "torch.ops.aten.fill_.Scalar" +exir_op = "executorch_exir_dialects_edge__ops_aten_full_like_default" + +input_t1 = Tuple[torch.Tensor] + +test_data_suite = { + "ones_float": [torch.ones(2, 3), 5.0], + "ones_int": [torch.ones(2, 3), -3], +} + + +class FillScalar(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, y: torch.Tensor, fill_value: int | float): + mask = torch.full_like(y, 0) + mask.fill_(fill_value) + return mask * y + + +@common.parametrize("test_data", test_data_suite) +def test_fill_scalar_tosa_FP(test_data: Tuple): + pipeline = TosaPipelineFP[input_t1]( + FillScalar(), + (*test_data,), + aten_op=aten_op, + exir_op=exir_op, + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +def test_fill_scalar_tosa_INT(test_data: Tuple): + pipeline = TosaPipelineINT[input_t1]( + FillScalar(), + (*test_data,), + aten_op=aten_op, + exir_op=exir_op, + ) + pipeline.run() + + +@common.XfailIfNoCorstone300 +@common.parametrize("test_data", test_data_suite) +def test_fill_scalar_u55_INT(test_data: Tuple): + pipeline = EthosU55PipelineINT[input_t1]( + FillScalar(), + (*test_data,), + aten_ops=[aten_op], + exir_ops=exir_op, + ) + pipeline.run() + + +@common.XfailIfNoCorstone320 +@common.parametrize("test_data", test_data_suite) +def test_fill_scalar_u85_INT(test_data: Tuple): + pipeline = EthosU85PipelineINT[input_t1]( + FillScalar(), + (*test_data,), + aten_ops=[aten_op], + exir_ops=exir_op, + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@common.SkipIfNoModelConverter +def 
test_fill_scalar_vgf_FP(test_data: input_t1): + pipeline = VgfPipeline[input_t1]( + FillScalar(), + (*test_data,), + aten_op, + exir_op, + tosa_version="TOSA-1.0+FP", + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_suite) +@common.SkipIfNoModelConverter +def test_fill_scalar_vgf_INT(test_data: input_t1): + pipeline = VgfPipeline[input_t1]( + FillScalar(), + (*test_data,), + aten_op, + exir_op, + tosa_version="TOSA-1.0+INT", + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_floor.py b/backends/arm/test/ops/test_floor.py index c66ef1c5d27..475fe18679a 100644 --- a/backends/arm/test/ops/test_floor.py +++ b/backends/arm/test/ops/test_floor.py @@ -78,7 +78,6 @@ def test_floor_u55_INT(test_data: input_t1): (data,), module.aten_op, module.exir_op, - run_on_fvp=True, ) pipeline.run() @@ -92,7 +91,6 @@ def test_floor_u85_INT(test_data: input_t1): (data,), module.aten_op, module.exir_op, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_full.py b/backends/arm/test/ops/test_full.py index 9e2c9b4d8be..8ab063e9957 100644 --- a/backends/arm/test/ops/test_full.py +++ b/backends/arm/test/ops/test_full.py @@ -202,7 +202,6 @@ def test_full_u85_INT(test_data: Tuple): test_data, aten_ops=[], exir_ops=exir_op, - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() @@ -216,7 +215,6 @@ def test_full_u55_INT(test_data: Tuple): test_data, aten_ops=[], exir_ops=exir_op, - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_ge.py b/backends/arm/test/ops/test_ge.py index 94f33d28630..ede5be76eda 100644 --- a/backends/arm/test/ops/test_ge.py +++ b/backends/arm/test/ops/test_ge.py @@ -161,7 +161,6 @@ def test_ge_tensor_u85_INT(test_module): test_module().get_inputs(), GreaterEqual.aten_op_tensor, GreaterEqual.exir_op, - run_on_fvp=True, ) pipeline.run() @@ -177,7 +176,6 @@ def test_ge_scalar_u85_INT(test_module): test_module().get_inputs(), 
GreaterEqual.aten_op_tensor, GreaterEqual.exir_op, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_group_norm.py b/backends/arm/test/ops/test_group_norm.py index 0f314064548..8f2c0f0d6a5 100644 --- a/backends/arm/test/ops/test_group_norm.py +++ b/backends/arm/test/ops/test_group_norm.py @@ -118,7 +118,6 @@ def test_native_group_norm_u55_INT(test_data): test_data[1], test_data[0], "torch.ops.aten.sub.Tensor", # 'sub' op arbitrarily chosen to confirm groupnorm was decomposed - run_on_fvp=True, atol=0.1, # TODO: "MLETORCH-925: Fix numerical issue for aten.native_group_norm" ) pipeline.change_args("run_method_and_compare_outputs", atol=1, qtol=1) @@ -142,7 +141,6 @@ def test_native_group_norm_u85_INT(test_data): test_data[1], test_data[0], "torch.ops.aten.sub.Tensor", # 'sub' op arbitrarily chosen to confirm groupnorm was decomposed - run_on_fvp=True, atol=0.1, # TODO: "MLETORCH-925: Fix numerical issue for aten.native_group_norm" ) pipeline.change_args("run_method_and_compare_outputs", atol=1, qtol=1) diff --git a/backends/arm/test/ops/test_gt.py b/backends/arm/test/ops/test_gt.py index 41229397eb5..0e50b6b78be 100644 --- a/backends/arm/test/ops/test_gt.py +++ b/backends/arm/test/ops/test_gt.py @@ -162,7 +162,6 @@ def test_gt_tensor_u85_INT(test_module): test_module().get_inputs(), Greater.aten_op_tensor, Greater.exir_op, - run_on_fvp=True, ) pipeline.run() @@ -178,7 +177,6 @@ def test_gt_scalar_u85_INT(test_module): test_module().get_inputs(), Greater.aten_op_tensor, Greater.exir_op, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_hardsigmoid.py b/backends/arm/test/ops/test_hardsigmoid.py index 5f591c15617..568eb069f8b 100644 --- a/backends/arm/test/ops/test_hardsigmoid.py +++ b/backends/arm/test/ops/test_hardsigmoid.py @@ -70,7 +70,6 @@ def test_hardsigmoid_u55_INT(test_data: torch.Tensor): (test_data(),), aten_op, exir_ops=[], - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() @@ -84,7 
+83,6 @@ def test_hardsigmoid_u85_INT(test_data: torch.Tensor): (test_data(),), aten_op, exir_ops=[], - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_hardswish.py b/backends/arm/test/ops/test_hardswish.py index 00db0cb296b..760293ec492 100644 --- a/backends/arm/test/ops/test_hardswish.py +++ b/backends/arm/test/ops/test_hardswish.py @@ -62,7 +62,6 @@ def test_hardswish_u55_INT(test_data): (test_data(),), aten_op, exir_op, - run_on_fvp=True, use_to_edge_transform_and_lower=True, ).run() @@ -75,7 +74,6 @@ def test_hardswish_u85_INT(test_data): (test_data(),), aten_op, exir_op, - run_on_fvp=True, use_to_edge_transform_and_lower=True, ).run() diff --git a/backends/arm/test/ops/test_hardtanh.py b/backends/arm/test/ops/test_hardtanh.py index 28f7e717351..3bb8e212cc9 100644 --- a/backends/arm/test/ops/test_hardtanh.py +++ b/backends/arm/test/ops/test_hardtanh.py @@ -71,7 +71,6 @@ def test_hardtanh_u55_INT(test_data: torch.Tensor): (test_data(),), aten_op, exir_op, - run_on_fvp=True, ) pipeline.run() @@ -84,7 +83,6 @@ def test_hardtanh_u85_INT(test_data: torch.Tensor): (test_data(),), aten_op, exir_op, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_index_select.py b/backends/arm/test/ops/test_index_select.py index 95ebaa62a38..6d2a6d73b70 100644 --- a/backends/arm/test/ops/test_index_select.py +++ b/backends/arm/test/ops/test_index_select.py @@ -174,8 +174,4 @@ def test_index_select_vgf_INT_rand(test_data: input_params): op.exir_op, tosa_version="TOSA-1.0+INT", ) - # TODO: MLETORCH-1136 Change args of run_method_and_compare_outputs of the vgf tests - # pipeline.change_args( - # "run_method_and_compare_outputs", inputs=test_input, atol=0.9, rtol=0.2, qtol=1 - # ) pipeline.run() diff --git a/backends/arm/test/ops/test_layer_norm.py b/backends/arm/test/ops/test_layer_norm.py index 2c9b83dc7e7..2659bc2eab4 100644 --- a/backends/arm/test/ops/test_layer_norm.py +++ 
b/backends/arm/test/ops/test_layer_norm.py @@ -95,7 +95,6 @@ def test_native_layer_norm_u55_INT(test_data): model, test_data, "torch.ops.aten.sub.Tensor", # Just check for sub op included in the layernorm decomposition - run_on_fvp=True, symmetric_io_quantization=True, ) pipeline.run() @@ -109,7 +108,6 @@ def test_native_layer_norm_u85_INT(test_data): model, test_data, "torch.ops.aten.sub.Tensor", # Just check for sub op included in the layernorm decomposition - run_on_fvp=True, symmetric_io_quantization=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_le.py b/backends/arm/test/ops/test_le.py index 31422302a2d..fd0e63e9beb 100644 --- a/backends/arm/test/ops/test_le.py +++ b/backends/arm/test/ops/test_le.py @@ -163,7 +163,6 @@ def test_le_tensor_u85_INT(test_module): test_module().get_inputs(), LessEqual.aten_op_tensor, LessEqual.exir_op, - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() @@ -180,7 +179,6 @@ def test_le_scalar_u85_INT(test_module): test_module().get_inputs(), LessEqual.aten_op_tensor, LessEqual.exir_op, - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_leaky_relu.py b/backends/arm/test/ops/test_leaky_relu.py index 432c4da7ecc..a7ae4cb8564 100644 --- a/backends/arm/test/ops/test_leaky_relu.py +++ b/backends/arm/test/ops/test_leaky_relu.py @@ -73,7 +73,6 @@ def test_leaky_relu_u55_INT(test_data): LeakyReLU(slope), data, [], - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) @@ -88,7 +87,6 @@ def test_leaky_relu_u85_INT(test_data): LeakyReLU(slope), data, [], - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) diff --git a/backends/arm/test/ops/test_linalg_vector_norm.py b/backends/arm/test/ops/test_linalg_vector_norm.py index 1777cffb0a7..df3bef38cc1 100644 --- 
a/backends/arm/test/ops/test_linalg_vector_norm.py +++ b/backends/arm/test/ops/test_linalg_vector_norm.py @@ -103,7 +103,6 @@ def test_vector_norm_u55_INT_fvp(test_module): input_tensor, aten_op_q_decomposed_q, exir_op_q_decomposed, - run_on_fvp=True, symmetric_io_quantization=True, ) pipeline.pop_stage("check_not.exir") @@ -121,7 +120,6 @@ def test_vector_norm_u85_INT_fvp(test_module): input_tensor, aten_op_q_decomposed_q, exir_op_q_decomposed, - run_on_fvp=True, symmetric_io_quantization=True, ) pipeline.pop_stage("check_not.exir") diff --git a/backends/arm/test/ops/test_linear.py b/backends/arm/test/ops/test_linear.py index e5d00c83e9f..4029fcef54e 100644 --- a/backends/arm/test/ops/test_linear.py +++ b/backends/arm/test/ops/test_linear.py @@ -8,7 +8,6 @@ from typing import Tuple -import pytest import torch from executorch.backends.arm.quantizer.arm_quantizer import ( get_symmetric_a16w8_quantization_config, @@ -181,7 +180,6 @@ def test_linear_u55_INT(test_data: torch.Tensor): (test_data,), aten_op, exir_ops=[], - run_on_fvp=True, per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, ).run() @@ -204,7 +202,6 @@ def test_linear_u85_INT(test_data: torch.Tensor): (test_data,), aten_op, exir_ops=[], - run_on_fvp=True, per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, ).run() @@ -276,10 +273,14 @@ def get_symmetric_a16w8_linear_quantizer( ) -@common.parametrize("test_data", test_data_rank1_INT | test_data_rank4_INT) -@pytest.mark.xfail( - reason="missing int16 linear ops support; fails at TOSA reference model run with Invalid TOSA graph" -) +test_data_all_16a8w = test_data_rank1_INT | test_data_rank4_INT +# TODO: Remove large rand test as they are flaky until sorted out why: MLETORCH-1377 +for k in list(test_data_all_16a8w.keys()): + if "large_rand" in k: + test_data_all_16a8w.pop(k) + + +@common.parametrize("test_data", test_data_all_16a8w) def test_linear_16a8w_tosa_INT(test_data: 
torch.Tensor): """Test linear operation with 16A8W quantization (16-bit activations, 8-bit weights)""" test_data, out_features, has_bias, per_channel_quantization = test_data() @@ -308,3 +309,63 @@ def test_linear_16a8w_tosa_INT(test_data: torch.Tensor): ) # Run the pipeline pipeline.run() + + +@common.parametrize("test_data", test_data_all_16a8w) +@common.XfailIfNoCorstone300 +def test_linear_16a8w_u55_INT16(test_data: torch.Tensor): + """Test linear operation with 16A8W quantization on U55 (16-bit activations, 8-bit weights)""" + test_data, out_features, has_bias, per_channel_quantization = test_data() + in_features = test_data.shape[-1] + + pipeline = EthosU55PipelineINT[input_t1]( + Linear( + in_features=in_features, + out_features=out_features, + bias=has_bias, + ), + (test_data,), + aten_op, + exir_ops=[], + per_channel_quantization=per_channel_quantization, + use_to_edge_transform_and_lower=True, + run_on_fvp=True, + ) + + pipeline.change_args( + "quantize", + get_symmetric_a16w8_linear_quantizer( + per_channel_quantization=per_channel_quantization + ), + ) + pipeline.run() + + +@common.parametrize("test_data", test_data_all_16a8w) +@common.XfailIfNoCorstone320 +def test_linear_16a8w_u85_INT16(test_data: torch.Tensor): + """Test linear operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)""" + test_data, out_features, has_bias, per_channel_quantization = test_data() + in_features = test_data.shape[-1] + + pipeline = EthosU85PipelineINT[input_t1]( + Linear( + in_features=in_features, + out_features=out_features, + bias=has_bias, + ), + (test_data,), + aten_op, + exir_ops=[], + per_channel_quantization=per_channel_quantization, + use_to_edge_transform_and_lower=True, + run_on_fvp=True, + ) + + pipeline.change_args( + "quantize", + get_symmetric_a16w8_linear_quantizer( + per_channel_quantization=per_channel_quantization + ), + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_log.py b/backends/arm/test/ops/test_log.py index 
1ed5c57f1ab..44811715407 100644 --- a/backends/arm/test/ops/test_log.py +++ b/backends/arm/test/ops/test_log.py @@ -60,7 +60,6 @@ def test_log_u55_INT(test_data: input_t1): (test_data(),), aten_op, exir_op, - run_on_fvp=True, ).run() @@ -72,7 +71,6 @@ def test_log_u85_INT(test_data: input_t1): (test_data(),), aten_op, exir_op, - run_on_fvp=True, ).run() diff --git a/backends/arm/test/ops/test_logical.py b/backends/arm/test/ops/test_logical.py index 2b160ce7b50..e772840e6e6 100644 --- a/backends/arm/test/ops/test_logical.py +++ b/backends/arm/test/ops/test_logical.py @@ -137,7 +137,6 @@ def test_logical_and_u85_INT(test_data: input_t2): test_data(), And().aten_op, And().exir_op, - run_on_fvp=True, atol=0, rtol=0, qtol=0, @@ -231,7 +230,6 @@ def test_logical_xor_u85_INT(test_data: input_t2): test_data(), Xor().aten_op, Xor().exir_op, - run_on_fvp=True, atol=0, rtol=0, qtol=0, @@ -325,7 +323,6 @@ def test_logical_or_u85_INT(test_data: input_t2): test_data(), Or().aten_op, Or().exir_op, - run_on_fvp=True, atol=0, rtol=0, qtol=0, @@ -419,7 +416,6 @@ def test_logical_not_u85_INT(test_data: input_t2): test_data(), Not().aten_op, Not().exir_op, - run_on_fvp=True, atol=0, rtol=0, qtol=0, diff --git a/backends/arm/test/ops/test_logsoftmax.py b/backends/arm/test/ops/test_logsoftmax.py index c4a68caabac..f0411847dd3 100644 --- a/backends/arm/test/ops/test_logsoftmax.py +++ b/backends/arm/test/ops/test_logsoftmax.py @@ -72,7 +72,6 @@ def test_log_softmax_u55_INT(test_data): LogSoftmax(dim), data, [], - run_on_fvp=True, ) pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) pipeline.change_args("run_method_and_compare_outputs", qtol=1) @@ -87,7 +86,6 @@ def test_log_softmax_u85_INT(test_data): LogSoftmax(dim), data, [], - run_on_fvp=True, ) pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) pipeline.change_args("run_method_and_compare_outputs", qtol=1) @@ -119,6 +117,4 @@ def test_log_softmax_vgf_INT(test_data): 
tosa_version="TOSA-1.0+INT", ) pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) - # TODO: MLETORCH-1136 Change args of run_method_and_compare_outputs of the vgf tests - # pipeline.change_args("run_method_and_compare_outputs", qtol=1) pipeline.run() diff --git a/backends/arm/test/ops/test_lshift.py b/backends/arm/test/ops/test_lshift.py index bab364a4528..3af49cd4dc2 100644 --- a/backends/arm/test/ops/test_lshift.py +++ b/backends/arm/test/ops/test_lshift.py @@ -103,7 +103,6 @@ def test_bitwise_left_shift_tensor_u55_INT_scalar(test_data): test_data, LshiftScalar.torch_op_INT, LshiftScalar.exir_op, - run_on_fvp=True, ) pipeline.pop_stage("check.quant_nodes") pipeline.run() @@ -117,7 +116,6 @@ def test_bitwise_left_shift_tensor_u85_INT_scalar(test_data): test_data, LshiftScalar.torch_op_INT, LshiftScalar.exir_op, - run_on_fvp=True, ) pipeline.pop_stage("check.quant_nodes") pipeline.run() @@ -178,28 +176,26 @@ def test_bitwise_left_shift_tensor_tosa_INT(test_data): @common.parametrize("test_data", LshiftTensor.test_data) -@XfailIfNoCorstone300 +@common.XfailIfNoCorstone300 def test_bitwise_left_shift_tensor_u55_INT(test_data): pipeline = EthosU55PipelineINT[scalar_input_t]( LshiftTensor(), test_data, LshiftTensor.torch_op, LshiftTensor.exir_op, - run_on_fvp=True, ) pipeline.pop_stage("check.quant_nodes") pipeline.run() @common.parametrize("test_data", LshiftTensor.test_data) -@XfailIfNoCorstone320 +@common.XfailIfNoCorstone320 def test_bitwise_left_shift_tensor_u85_INT(test_data): pipeline = EthosU85PipelineINT[scalar_input_t]( LshiftTensor(), test_data, LshiftTensor.torch_op, LshiftTensor.exir_op, - run_on_fvp=True, ) pipeline.pop_stage("check.quant_nodes") pipeline.run() diff --git a/backends/arm/test/ops/test_lt.py b/backends/arm/test/ops/test_lt.py index 98d0298b195..d0ed1a34185 100644 --- a/backends/arm/test/ops/test_lt.py +++ b/backends/arm/test/ops/test_lt.py @@ -162,7 +162,6 @@ def test_lt_tensor_u85_INT(test_module): 
test_module().get_inputs(), LessThan.aten_op_tensor, LessThan.exir_op, - run_on_fvp=True, ) pipeline.run() @@ -178,7 +177,6 @@ def test_lt_scalar_u85_INT(test_module): test_module().get_inputs(), LessThan.aten_op_tensor, LessThan.exir_op, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_matmul.py b/backends/arm/test/ops/test_matmul.py index d1a21684325..f564672e98f 100644 --- a/backends/arm/test/ops/test_matmul.py +++ b/backends/arm/test/ops/test_matmul.py @@ -22,6 +22,7 @@ class MatMul(torch.nn.Module): test_data_generators = { + "rand_rand_2d": lambda: (torch.rand(5, 5), torch.rand(5, 2)), "rand_rand_3d": lambda: (torch.rand(2, 3, 5), torch.rand(2, 5, 2)), "rand_rand_4d": lambda: (torch.rand(1, 2, 3, 5), torch.rand(1, 2, 5, 2)), } @@ -32,6 +33,7 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): class MatMulSingleInput(torch.nn.Module): test_data_generators = { + "rand_2d": lambda: (torch.rand(5, 5),), "rand_3d": lambda: (torch.rand(2, 5, 5),), "rand_4d": lambda: (torch.rand(1, 2, 5, 5),), } @@ -42,6 +44,11 @@ def forward(self, x: torch.Tensor): class MatMulCombo(torch.nn.Module): test_data_generators = { + "rand_rand_rand_2d": lambda: ( + torch.rand(5, 5), + torch.rand(5, 2), + torch.rand(2, 5), + ), "rand_rand_rand_3d": lambda: ( torch.rand(2, 5, 5), torch.rand(2, 5, 2), @@ -122,7 +129,6 @@ def test_matmul_u55_INT(test_data: input_t1): test_data(), aten_op_mm, exir_op_mm, - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() @@ -136,7 +142,6 @@ def test_matmul_single_input_u55_INT(test_data: input_t1): test_data(), aten_op_mm, exir_op_mm, - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() @@ -150,7 +155,6 @@ def test_matmul_combo_u55_INT(test_data: input_t1): test_data(), aten_op_mm, exir_op_mm, - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() @@ -164,7 +168,6 @@ def test_matmul_u85_INT(test_data: input_t1): test_data(), aten_op_mm, exir_op_mm, - run_on_fvp=True, 
use_to_edge_transform_and_lower=True, ) pipeline.run() @@ -178,7 +181,6 @@ def test_matmul_single_input_u85_INT(test_data: input_t1): test_data(), aten_op_mm, exir_op_mm, - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() @@ -192,7 +194,6 @@ def test_matmul_combo_u85_INT(test_data: input_t1): test_data(), aten_op_mm, exir_op_mm, - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_max_pool.py b/backends/arm/test/ops/test_max_pool.py index 7db56311837..559932848e4 100644 --- a/backends/arm/test/ops/test_max_pool.py +++ b/backends/arm/test/ops/test_max_pool.py @@ -142,7 +142,6 @@ def test_max_pool2d_u55_INT(test_data: torch.Tensor): (test_data,), aten_op, exir_ops=[], - run_on_fvp=True, ).run() @@ -155,7 +154,6 @@ def test_max_pool2d_u85_INT(test_data: torch.Tensor): (test_data,), aten_op, exir_ops=[], - run_on_fvp=True, ).run() diff --git a/backends/arm/test/ops/test_maximum.py b/backends/arm/test/ops/test_maximum.py index eb0d4b86efc..ed3a5247d3d 100644 --- a/backends/arm/test/ops/test_maximum.py +++ b/backends/arm/test/ops/test_maximum.py @@ -61,7 +61,6 @@ def test_maximum_u55_INT(test_data: Tuple): Maximum(), test_data(), aten_op, - run_on_fvp=True, ).run() @@ -72,7 +71,6 @@ def test_maximum_u85_INT(test_data: Tuple): Maximum(), test_data(), aten_op, - run_on_fvp=True, ).run() diff --git a/backends/arm/test/ops/test_mean_dim.py b/backends/arm/test/ops/test_mean_dim.py index 061e8da14f1..970340c352b 100644 --- a/backends/arm/test/ops/test_mean_dim.py +++ b/backends/arm/test/ops/test_mean_dim.py @@ -4,7 +4,6 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
- import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.test_pipeline import ( @@ -66,7 +65,6 @@ def test_adaptive_avg_pool2d_u55_INT(test_data): test_data(), AdaptiveAveragePool2d.aten_op, AdaptiveAveragePool2d.exir_op, - run_on_fvp=True, symmetric_io_quantization=True, ).run() @@ -79,7 +77,6 @@ def test_adaptive_avg_pool2d_u85_INT(test_data): test_data(), AdaptiveAveragePool2d.aten_op, AdaptiveAveragePool2d.exir_op, - run_on_fvp=True, symmetric_io_quantization=True, ).run() @@ -115,7 +112,7 @@ class MeanDim(torch.nn.Module): test_data_suite: dict[str, tuple] = { "rank_1_keepdim": lambda: ( torch.rand(7), - (0), + 0, True, ), "rank_2_keepdim": lambda: ( @@ -168,6 +165,11 @@ class MeanDim(torch.nn.Module): (0, 1, 2, 3), True, ), + "rand_none_keepdim": lambda: ( + torch.rand(1, 5, 7, 3), + None, + True, + ), "rank_1": lambda: ( torch.rand(7), (-1), @@ -280,7 +282,6 @@ def test_mean_dim_tosa_INT(test_data): (test_data,), [], # Might be sum, avgpool, or both symmetric_io_quantization=True, - custom_path="MEANDIM", ) pipeline.run() @@ -301,7 +302,6 @@ def test_mean_dim_u55_INT(test_data): MeanDim(dim, keep_dim), (test_data,), [], # Might be sum, avgpool, or both - run_on_fvp=True, symmetric_io_quantization=True, ) pipeline.add_stage_after( @@ -321,7 +321,6 @@ def test_mean_dim_u85_INT(test_data): MeanDim(dim, keep_dim), (test_data,), [], # Might be sum, avgpool, or both - run_on_fvp=True, symmetric_io_quantization=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_minimum.py b/backends/arm/test/ops/test_minimum.py index 88ae2c2b8da..3e87e64acbd 100644 --- a/backends/arm/test/ops/test_minimum.py +++ b/backends/arm/test/ops/test_minimum.py @@ -61,7 +61,6 @@ def test_minimum_u55_INT(test_data: Tuple): Minimum(), test_data(), aten_op, - run_on_fvp=True, ).run() @@ -72,7 +71,6 @@ def test_minimum_u85_INT(test_data: Tuple): Minimum(), test_data(), aten_op, - run_on_fvp=True, ).run() diff --git 
a/backends/arm/test/ops/test_mm.py b/backends/arm/test/ops/test_mm.py index 1b76baaeff0..afb7a6d7d30 100644 --- a/backends/arm/test/ops/test_mm.py +++ b/backends/arm/test/ops/test_mm.py @@ -53,7 +53,6 @@ def test_mm_u55_INT(test_data: Tuple): MM(), test_data(), MM.aten_op, - run_on_fvp=True, ).run() @@ -65,7 +64,6 @@ def test_mm_u85_INT(test_data: Tuple): test_data(), MM.aten_op, MM.exir_op, - run_on_fvp=True, ).run() diff --git a/backends/arm/test/ops/test_mul.py b/backends/arm/test/ops/test_mul.py index b2db55d90fd..02447e40c4e 100644 --- a/backends/arm/test/ops/test_mul.py +++ b/backends/arm/test/ops/test_mul.py @@ -8,7 +8,6 @@ from typing import Tuple -import pytest import torch from executorch.backends.arm.quantizer.arm_quantizer import ( get_symmetric_a16w8_quantization_config, @@ -200,7 +199,6 @@ def test_mul_tensor_u55_INT(test_data: torch.Tensor): test_data(), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -213,7 +211,6 @@ def test_mul_tensor_u85_INT(test_data: torch.Tensor): test_data(), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -226,7 +223,6 @@ def test_mul_tensor_u55_INT_int32(test_data: torch.Tensor): test_data(), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.pop_stage("check.quant_nodes") pipeline.run() @@ -240,7 +236,6 @@ def test_mul_tensor_u85_INT_int32(test_data: torch.Tensor): test_data(), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.pop_stage("check.quant_nodes") pipeline.run() @@ -314,9 +309,6 @@ def get_symmetric_a16w8_mul_quantizer(per_channel_quantization=False): @common.parametrize("test_data", test_data_suite) -@pytest.mark.xfail( - reason="missing int16 mul ops support; fails at TOSA reference model with Unsupported operation type or rank. 
See: https://github.com/pytorch/executorch/issues/13947" -) def test_mul_tensor_16a8w_tosa_INT(test_data: input_t1): """Test mul operation with 16A8W quantization (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -342,9 +334,6 @@ def test_mul_tensor_16a8w_tosa_INT(test_data: input_t1): @common.parametrize("test_data", test_data_suite) @common.XfailIfNoCorstone300 -@pytest.mark.xfail( - reason="Vela compilation fails with 'Invalid arguments' for int16 mul operations. See: https://github.com/pytorch/executorch/issues/13947" -) def test_mul_tensor_16a8w_u55_INT16(test_data: input_t1): """Test mul operation with 16A8W quantization on U55 (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -356,7 +345,6 @@ def test_mul_tensor_16a8w_u55_INT16(test_data: input_t1): exir_ops=[], per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( @@ -370,9 +358,6 @@ def test_mul_tensor_16a8w_u55_INT16(test_data: input_t1): @common.parametrize("test_data", test_data_suite) @common.XfailIfNoCorstone320 -@pytest.mark.xfail( - reason="Vela compilation fails with 'Invalid arguments' for int16 mul operations. 
See: https://github.com/pytorch/executorch/issues/13947" -) def test_mul_tensor_16a8w_u85_INT16(test_data: input_t1): """Test mul operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -384,7 +369,6 @@ def test_mul_tensor_16a8w_u85_INT16(test_data: input_t1): exir_ops=[], per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( diff --git a/backends/arm/test/ops/test_multihead_attention.py b/backends/arm/test/ops/test_multihead_attention.py index 71cf076a157..cbc2ccb32f4 100644 --- a/backends/arm/test/ops/test_multihead_attention.py +++ b/backends/arm/test/ops/test_multihead_attention.py @@ -3,7 +3,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import pytest import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.test_pipeline import ( @@ -69,7 +68,6 @@ def test_multihead_attention_tosa_INT(test_data): "test_data", test_suite, ) -@pytest.mark.xfail(reason="MLETORCH-1102: Numerical issues on FVP") @common.XfailIfNoCorstone300 def test_multihead_attention_u55_INT(test_data: input_t1): test_data, module = test_data() @@ -79,7 +77,6 @@ def test_multihead_attention_u55_INT(test_data: input_t1): [], [], use_to_edge_transform_and_lower=True, - run_on_fvp=True, # TODO: Per-channel quantization is broken (MLETORCH-1144) per_channel_quantization=False, ) @@ -91,7 +88,6 @@ def test_multihead_attention_u55_INT(test_data: input_t1): "test_data", test_suite, ) -@pytest.mark.xfail(reason="MLETORCH-1102: Numerical issues on FVP") @common.XfailIfNoCorstone320 def test_multihead_attention_u85_INT(test_data: input_t1): test_data, module = test_data() @@ -101,7 +97,6 @@ def test_multihead_attention_u85_INT(test_data: input_t1): [], [], use_to_edge_transform_and_lower=True, - run_on_fvp=True, # TODO: Per-channel 
quantization is broken (MLETORCH-1144) per_channel_quantization=False, ) diff --git a/backends/arm/test/ops/test_ne.py b/backends/arm/test/ops/test_ne.py index 60f07ad9fdd..e20953b64dc 100644 --- a/backends/arm/test/ops/test_ne.py +++ b/backends/arm/test/ops/test_ne.py @@ -171,7 +171,6 @@ def test_ne_tensor_u85_INT(test_module): test_module.get_inputs(), NotEqual.decomposed_ops, NotEqual.decomposed_exir_ops, - run_on_fvp=True, ) pipeline.run() @@ -192,7 +191,6 @@ def test_ne_scalar_u85_INT(test_module): test_module.get_inputs(), NotEqual.decomposed_ops, NotEqual.decomposed_exir_ops, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_neg.py b/backends/arm/test/ops/test_neg.py index 395a4815b62..f0afe7bd23b 100644 --- a/backends/arm/test/ops/test_neg.py +++ b/backends/arm/test/ops/test_neg.py @@ -53,7 +53,10 @@ def test_neg_tosa_INT(test_data: input_t1): @common.XfailIfNoCorstone300 def test_neg_u55_INT(test_data: input_t1): pipeline = EthosU55PipelineINT[input_t1]( - Neg(), test_data, Neg.aten_op, Neg.exir_op, run_on_fvp=True + Neg(), + test_data, + Neg.aten_op, + Neg.exir_op, ) pipeline.run() @@ -62,7 +65,10 @@ def test_neg_u55_INT(test_data: input_t1): @common.XfailIfNoCorstone320 def test_neg_u85_INT(test_data: input_t1): pipeline = EthosU85PipelineINT[input_t1]( - Neg(), test_data, Neg.aten_op, Neg.exir_op, run_on_fvp=True + Neg(), + test_data, + Neg.aten_op, + Neg.exir_op, ) pipeline.run() diff --git a/backends/arm/test/ops/test_permute.py b/backends/arm/test/ops/test_permute.py index eb482bcee54..6fd8555b56b 100644 --- a/backends/arm/test/ops/test_permute.py +++ b/backends/arm/test/ops/test_permute.py @@ -85,7 +85,6 @@ def test_permute_u55_INT(test_data): (test_data,), aten_op, exir_ops="executorch_exir_dialects_edge__ops_aten_permute_copy_default", - run_on_fvp=True, ) pipeline.run() @@ -99,7 +98,6 @@ def test_permute_u85_INT(test_data: torch.Tensor): (test_data,), aten_op, 
exir_ops="executorch_exir_dialects_edge__ops_aten_permute_copy_default", - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_pixel_shuffling.py b/backends/arm/test/ops/test_pixel_shuffling.py new file mode 100644 index 00000000000..5aeb8b2d1bb --- /dev/null +++ b/backends/arm/test/ops/test_pixel_shuffling.py @@ -0,0 +1,233 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +from typing import Tuple + +import pytest + +import torch + +from executorch.backends.arm.constants import MAX_RANK + +from executorch.backends.arm.test import common + +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineINT, + EthosU85PipelineINT, + TosaPipelineFP, + TosaPipelineINT, + VgfPipeline, +) +from torch import nn + +aten_op_pixel_unshuffle = "torch.ops.aten.pixel_unshuffle.default" +exir_op_pixel_unshuffle = ( + "executorch_exir_dialects_edge__ops_aten_pixel_unshuffle_default" +) + +aten_op_pixel_shuffle = "torch.ops.aten.pixel_shuffle.default" +exir_op_pixel_shuffle = "executorch_exir_dialects_edge__ops_aten_pixel_shuffle_default" + +input_t1 = Tuple[torch.Tensor] # single positional input (1-tuple) + +max_rank_input_supported = MAX_RANK - 2 + + +class PixelUnShuffle(nn.Module): + + upscale_factor = 2 + test_data_generators = { + "rand_4d": lambda: (torch.randn(1, 12, 64, 64),), + "test_4d": lambda: (torch.tensor([[[[10.0, 20.0], [30.0, 40.0]]]]),), + "test_3d": lambda: (torch.tensor([[[10.0, 20.0], [30.0, 40.0]]]),), + } + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.space_to_depth = nn.PixelUnshuffle(self.upscale_factor) + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + if inputs.dim() > max_rank_input_supported: + raise RuntimeError( + f"Max rank of input for pixel_unshuffle is currently {max_rank_input_supported}, got {inputs.dim()}" + 
) + return self.space_to_depth(inputs) + + +class PixelShuffle(nn.Module): + + upscale_factor = 2 + test_data_generators = { + "rand_4d": lambda: (torch.randn(1, 12, 64, 64),), + "test_4d": lambda: (torch.tensor([[[[10.0]], [[20.0]], [[30.0]], [[40.0]]]]),), + "test_3d": lambda: (torch.tensor([[[10.0]], [[20.0]], [[30.0]], [[40.0]]]),), + } + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.depth_to_space = nn.PixelShuffle(self.upscale_factor) + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + if inputs.dim() > max_rank_input_supported: + raise RuntimeError( + f"Max rank of input for pixel_shuffle is currently {max_rank_input_supported}, got {inputs.dim()}" + ) + return self.depth_to_space(inputs) + + +@common.parametrize("test_data", PixelUnShuffle.test_data_generators) +def test_pixel_unshuffle_tosa_FP(test_data: input_t1): + pipeline = TosaPipelineFP[input_t1]( + PixelUnShuffle(), + test_data(), + aten_op_pixel_unshuffle, + exir_op_pixel_unshuffle, + ) + pipeline.run() + + +@common.parametrize("test_data", PixelUnShuffle.test_data_generators) +def test_pixel_unshuffle_tosa_INT(test_data: input_t1): + pipeline = TosaPipelineINT[input_t1]( + PixelUnShuffle(), + test_data(), + aten_op_pixel_unshuffle, + exir_op_pixel_unshuffle, + ) + pipeline.run() + + +@common.parametrize("test_data", PixelShuffle.test_data_generators) +def test_pixel_shuffle_tosa_FP(test_data: input_t1): + pipeline = TosaPipelineFP[input_t1]( + PixelShuffle(), + test_data(), + aten_op_pixel_shuffle, + exir_op_pixel_shuffle, + ) + pipeline.run() + + +@common.parametrize("test_data", PixelShuffle.test_data_generators) +def test_pixel_shuffle_tosa_INT(test_data: input_t1): + pipeline = TosaPipelineINT[input_t1]( + PixelShuffle(), + test_data(), + aten_op_pixel_shuffle, + exir_op_pixel_shuffle, + ) + pipeline.run() + + +@common.parametrize("test_data", PixelUnShuffle.test_data_generators) +@common.SkipIfNoModelConverter +def 
test_pixel_unshuffle_vgf_FP(test_data: input_t1): + pipeline = VgfPipeline[input_t1]( + PixelUnShuffle(), + test_data(), + aten_op_pixel_unshuffle, + exir_op_pixel_unshuffle, + tosa_version="TOSA-1.0+FP", + run_on_vulkan_runtime=True, + ) + pipeline.run() + + +@common.parametrize("test_data", PixelUnShuffle.test_data_generators) +@common.SkipIfNoModelConverter +def test_pixel_unshuffle_vgf_INT(test_data: input_t1): + pipeline = VgfPipeline[input_t1]( + PixelUnShuffle(), + test_data(), + aten_op_pixel_unshuffle, + exir_op_pixel_unshuffle, + tosa_version="TOSA-1.0+INT", + run_on_vulkan_runtime=True, + ) + pipeline.run() + + +@common.parametrize("test_data", PixelShuffle.test_data_generators) +@common.SkipIfNoModelConverter +def test_pixel_shuffle_vgf_FP(test_data: input_t1): + pipeline = VgfPipeline[input_t1]( + PixelShuffle(), + test_data(), + aten_op_pixel_shuffle, + exir_op_pixel_shuffle, + tosa_version="TOSA-1.0+FP", + run_on_vulkan_runtime=True, + ) + pipeline.run() + + +@common.parametrize("test_data", PixelShuffle.test_data_generators) +@common.SkipIfNoModelConverter +def test_pixel_shuffle_vgf_INT(test_data: input_t1): + pipeline = VgfPipeline[input_t1]( + PixelShuffle(), + test_data(), + aten_op_pixel_shuffle, + exir_op_pixel_shuffle, + tosa_version="TOSA-1.0+INT", + run_on_vulkan_runtime=True, + ) + pipeline.run() + + +@common.parametrize("test_data", PixelUnShuffle.test_data_generators) +@common.XfailIfNoCorstone300 +def test_pixel_unshuffle_u55_INT(test_data: input_t1): + pipeline = EthosU55PipelineINT[input_t1]( + PixelUnShuffle(), + test_data(), + aten_op_pixel_unshuffle, + exir_op_pixel_unshuffle, + run_on_fvp=True, + ) + pipeline.run() + + +@common.parametrize("test_data", PixelUnShuffle.test_data_generators) +@common.XfailIfNoCorstone320 +@pytest.mark.xfail(reason="MLETORCH-1424: rand test fails") +def test_pixel_unshuffle_u85_INT(test_data: input_t1): + pipeline = EthosU85PipelineINT[input_t1]( + PixelUnShuffle(), + test_data(), + 
aten_op_pixel_unshuffle, + exir_op_pixel_unshuffle, + run_on_fvp=True, + ) + pipeline.run() + + +@common.parametrize("test_data", PixelShuffle.test_data_generators) +@common.XfailIfNoCorstone300 +def test_pixel_shuffle_u55_INT(test_data: input_t1): + pipeline = EthosU55PipelineINT[input_t1]( + PixelShuffle(), + test_data(), + aten_op_pixel_shuffle, + exir_op_pixel_shuffle, + run_on_fvp=True, + ) + pipeline.run() + + +@common.parametrize("test_data", PixelShuffle.test_data_generators) +@common.XfailIfNoCorstone320 +@pytest.mark.xfail(reason="MLETORCH-1424: rand test fails") +def test_pixel_shuffle_u85_INT(test_data: input_t1): + pipeline = EthosU85PipelineINT[input_t1]( + PixelShuffle(), + test_data(), + aten_op_pixel_shuffle, + exir_op_pixel_shuffle, + run_on_fvp=True, + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_pow.py b/backends/arm/test/ops/test_pow.py index 016c3e97265..377d1355992 100644 --- a/backends/arm/test/ops/test_pow.py +++ b/backends/arm/test/ops/test_pow.py @@ -159,7 +159,6 @@ def test_pow_tensor_scalar_u55_INT(test_data: Pow_TensorScalar.input_t): (base,), Pow_TensorScalar.aten_op, Pow_TensorScalar.exir_op, - run_on_fvp=True, ) pipeline.run() @@ -173,7 +172,6 @@ def test_pow_tensor_scalar_u85_INT(test_data: Pow_TensorScalar.input_t): (base,), Pow_TensorScalar.aten_op, Pow_TensorScalar.exir_op, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_reciprocal.py b/backends/arm/test/ops/test_reciprocal.py index 78edbb980e8..3e4d7c18b40 100644 --- a/backends/arm/test/ops/test_reciprocal.py +++ b/backends/arm/test/ops/test_reciprocal.py @@ -71,7 +71,6 @@ def test_reciprocal_u55_INT(test_data: torch.Tensor): (test_data(),), aten_op, exir_ops=[], - run_on_fvp=False, ) pipeline.run() @@ -84,7 +83,6 @@ def test_reciprocal_u85_INT(test_data: torch.Tensor): (test_data(),), aten_op, exir_ops=[], - run_on_fvp=False, symmetric_io_quantization=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_relu.py 
b/backends/arm/test/ops/test_relu.py index 0b29bc24e75..fad6e7a9162 100644 --- a/backends/arm/test/ops/test_relu.py +++ b/backends/arm/test/ops/test_relu.py @@ -43,6 +43,28 @@ def forward(self, x): return self.relu(x) +test_data_conv_relu = { + # (test_name, test_data) + "4d_randn_inplace=True": (lambda: (torch.randn(1, 64, 96, 96) * 1000, True)), + "4d_randn_inplace=False": (lambda: (torch.randn(1, 64, 96, 96) * 1000, False)), +} + + +class Conv2d_Relu_Add(torch.nn.Module): + def __init__(self, inplace: bool = True): + super().__init__() + self.conv1 = torch.nn.Conv2d( + in_channels=64, out_channels=64, kernel_size=7, padding="same" + ) + self.relu = torch.nn.ReLU(inplace=inplace) + + def forward(self, x: torch.Tensor): + y = self.conv1(x) + z = self.relu(y) + out = x + z + return out + + @common.parametrize("test_data", test_data_suite) def test_relu_tosa_FP(test_data: torch.Tensor): pipeline = TosaPipelineFP[input_t1]( @@ -54,6 +76,35 @@ def test_relu_tosa_FP(test_data: torch.Tensor): pipeline.run() +# Test the folding of Conv2D with ReLU +@common.parametrize("test_data", test_data_conv_relu) +def test_conv_relu_folding_tosa_INT(test_data: torch.Tensor): + input_data, inplace = test_data() + pipeline = TosaPipelineINT[input_t1]( + Conv2d_Relu_Add(inplace=inplace), + (input_data,), + [], + [], + ) + # We should have : + # 3 quantize_per_tensor nodes: input activation , output of the conv-relu sequence, out of the add + # 4 dequantize_per_tensor nodes: into the conv2d input, into the add, output of the conv-relu sequence, before returning + # 2 dequantize_per_channel nodes: one for the weights and another one for the bias + # In case of incorrect annotation of the ReLU, we get separate Q/DR around both the conv2d and the ReLU and + # therefore more quantize_per_tensor and dequantize_per_tensor nodes + pipeline.add_stage_after( + "quantize", + pipeline.tester.check_count, + { + "quantized_decomposed.quantize_per_tensor.default": 3, + 
"torch.ops.quantized_decomposed.dequantize_per_tensor.default": 4, + "quantized_decomposed.dequantize_per_channel.default": 2, + }, + suffix="quant_nodes", + ) + pipeline.run() + + @common.parametrize("test_data", test_data_suite) def test_relu_tosa_INT(test_data: torch.Tensor): pipeline = TosaPipelineINT[input_t1]( @@ -66,25 +117,25 @@ def test_relu_tosa_INT(test_data: torch.Tensor): @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone300 def test_relu_u55_INT(test_data: torch.Tensor): pipeline = EthosU55PipelineINT[input_t1]( Relu(), (test_data(),), aten_op, exir_op, - run_on_fvp=False, ) pipeline.run() @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone320 def test_relu_u85_INT(test_data: torch.Tensor): pipeline = EthosU85PipelineINT[input_t1]( Relu(), (test_data(),), aten_op, exir_op, - run_on_fvp=False, ) pipeline.run() diff --git a/backends/arm/test/ops/test_repeat.py b/backends/arm/test/ops/test_repeat.py index 3236515b661..56986a54781 100644 --- a/backends/arm/test/ops/test_repeat.py +++ b/backends/arm/test/ops/test_repeat.py @@ -88,6 +88,7 @@ def test_repeat_tosa_INT(test_data: Tuple): @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone300 def test_repeat_u55_INT(test_data: Tuple): module, test_data = test_data() pipeline = EthosU55PipelineINT[input_t1]( @@ -95,12 +96,12 @@ def test_repeat_u55_INT(test_data: Tuple): test_data, module.aten_op, exir_ops=[], - run_on_fvp=False, ) pipeline.run() @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone320 def test_repeat_u85_INT(test_data: Tuple): module, test_data = test_data() pipeline = EthosU85PipelineINT[input_t1]( @@ -108,7 +109,6 @@ def test_repeat_u85_INT(test_data: Tuple): test_data, module.aten_op, exir_ops=[], - run_on_fvp=False, ) pipeline.run() diff --git a/backends/arm/test/ops/test_rshift.py b/backends/arm/test/ops/test_rshift.py index e97bfb840ae..f7a821e3a63 100644 --- 
a/backends/arm/test/ops/test_rshift.py +++ b/backends/arm/test/ops/test_rshift.py @@ -96,14 +96,13 @@ def test_bitwise_right_shift_tensor_tosa_INT_scalar(test_data): @common.parametrize("test_data", RshiftScalar.test_data) -@XfailIfNoCorstone300 +@common.XfailIfNoCorstone300 def test_bitwise_right_shift_tensor_u55_INT_scalar(test_data): pipeline = EthosU55PipelineINT[scalar_input_t]( RshiftScalar(), test_data(), RshiftScalar.torch_op_INT, RshiftScalar.exir_op, - run_on_fvp=True, ) pipeline.pop_stage("check.quant_nodes") @@ -113,14 +112,13 @@ def test_bitwise_right_shift_tensor_u55_INT_scalar(test_data): @common.parametrize("test_data", RshiftScalar.test_data) -@XfailIfNoCorstone320 +@common.XfailIfNoCorstone320 def test_bitwise_right_shift_tensor_u85_INT_scalar(test_data): pipeline = EthosU85PipelineINT[scalar_input_t]( RshiftScalar(), test_data(), RshiftScalar.torch_op_INT, RshiftScalar.exir_op, - run_on_fvp=True, ) pipeline.pop_stage("check.quant_nodes") pipeline.run() @@ -188,7 +186,6 @@ def test_bitwise_right_shift_tensor_u55_INT(test_data): test_data(), RshiftTensor.torch_op, RshiftTensor.exir_op, - run_on_fvp=True, ) pipeline.pop_stage("check.quant_nodes") @@ -205,7 +202,6 @@ def test_bitwise_right_shift_tensor_u85_INT(test_data): test_data(), RshiftTensor.torch_op, RshiftTensor.exir_op, - run_on_fvp=True, ) pipeline.pop_stage("check.quant_nodes") pipeline.run() diff --git a/backends/arm/test/ops/test_rsqrt.py b/backends/arm/test/ops/test_rsqrt.py index d146a83287e..23bb9dc1a4b 100644 --- a/backends/arm/test/ops/test_rsqrt.py +++ b/backends/arm/test/ops/test_rsqrt.py @@ -66,7 +66,6 @@ def test_rsqrt_u55_INT(test_tensor: torch.Tensor): test_tensor(), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -79,7 +78,6 @@ def test_rsqrt_u85_INT(test_tensor: torch.Tensor): test_tensor(), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_scalar_tensor.py b/backends/arm/test/ops/test_scalar_tensor.py index 
22c1cc0373d..d5e5b365da1 100644 --- a/backends/arm/test/ops/test_scalar_tensor.py +++ b/backends/arm/test/ops/test_scalar_tensor.py @@ -2,7 +2,6 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. - import torch from executorch.backends.arm.test import common @@ -86,7 +85,6 @@ def test_scalar_tensor_u55_INT(test_data): ScalarTensor(scalar, dtype), tuple(data), ScalarTensor.aten_op, - run_on_fvp=True, ).run() @@ -98,7 +96,6 @@ def test_scalar_tensor_u85_INT(test_data): ScalarTensor(scalar, dtype), tuple(data), ScalarTensor.aten_op, - run_on_fvp=True, ).run() diff --git a/backends/arm/test/ops/test_select.py b/backends/arm/test/ops/test_select.py index 4c3887f1e18..23046c34fe4 100644 --- a/backends/arm/test/ops/test_select.py +++ b/backends/arm/test/ops/test_select.py @@ -110,7 +110,6 @@ def test_select_int_u55_INT_copy(test_data: Tuple): test_data(), aten_op_copy, exir_ops=[], - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() @@ -124,7 +123,6 @@ def test_select_int_u55_INT(test_data: Tuple): test_data(), aten_op_int, exir_ops=[], - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() @@ -151,7 +149,6 @@ def test_select_int_u85_INT_copy(test_data: Tuple): test_data(), aten_op_copy, exir_ops=[], - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() @@ -165,7 +162,6 @@ def test_select_int_u85_INT(test_data: Tuple): test_data(), aten_op_int, exir_ops=[], - run_on_fvp=True, use_to_edge_transform_and_lower=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_sigmoid.py b/backends/arm/test/ops/test_sigmoid.py index aac2ee1c9b1..a9b9ef11b48 100644 --- a/backends/arm/test/ops/test_sigmoid.py +++ b/backends/arm/test/ops/test_sigmoid.py @@ -141,25 +141,25 @@ def test_sigmoid_tosa_INT_3(): @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone300 def test_sigmoid_u55_INT(test_data: Tuple): pipeline = 
EthosU55PipelineINT[input_t1]( Sigmoid(), (test_data(),), aten_op, exir_op, - run_on_fvp=False, ) pipeline.run() @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone320 def test_sigmoid_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( Sigmoid(), (test_data(),), aten_op, exir_op, - run_on_fvp=False, ) pipeline.run() @@ -324,7 +324,6 @@ def test_sigmoid_16a8w_u55_INT16(test_data: torch.Tensor): exir_op, per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( @@ -352,7 +351,6 @@ def test_sigmoid_16a8w_u85_INT16(test_data: torch.Tensor): exir_op, per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( diff --git a/backends/arm/test/ops/test_sigmoid_16bit.py b/backends/arm/test/ops/test_sigmoid_16bit.py index ad8c49b234c..587ba99222a 100644 --- a/backends/arm/test/ops/test_sigmoid_16bit.py +++ b/backends/arm/test/ops/test_sigmoid_16bit.py @@ -125,6 +125,7 @@ def test_sigmoid_tosa_INT_add_sigmoid(test_data): "test_data", test_data_suite, ) +@common.XfailIfNoCorstone300 def test_sigmoid_u55_INT(test_data): pipeline = OpNotSupportedPipeline( Sigmoid(), @@ -141,6 +142,7 @@ def test_sigmoid_u55_INT(test_data): "test_data", test_data_suite, ) +@common.XfailIfNoCorstone300 def test_sigmoid_u55_INT_add_sigmoid(test_data): pipeline = OpNotSupportedPipeline( SigmoidAddSigmoid(), @@ -163,7 +165,6 @@ def test_sigmoid_u85_INT(test_data): (test_data(),), Sigmoid.aten_op, Sigmoid.exir_op, - run_on_fvp=True, ) pipeline.change_args("quantize", get_16bit_sigmoid_quantizer()) pipeline.run() @@ -184,7 +185,6 @@ def test_sigmoid_u85_INT_add_sigmoid(test_data): (test_data(),), Sigmoid.aten_op, Sigmoid.exir_op, - run_on_fvp=True, ) pipeline.change_args("quantize", get_16bit_sigmoid_quantizer()) pipeline.run() diff --git a/backends/arm/test/ops/test_sigmoid_32bit.py 
b/backends/arm/test/ops/test_sigmoid_32bit.py index 70863cd4757..389f1d8a278 100644 --- a/backends/arm/test/ops/test_sigmoid_32bit.py +++ b/backends/arm/test/ops/test_sigmoid_32bit.py @@ -131,6 +131,7 @@ def test_sigmoid_tosa_INT_add_sigmoid(test_data): @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone300 def test_sigmoid_u55_INT(test_data): pipeline = OpNotSupportedPipeline( Sigmoid(), @@ -145,6 +146,7 @@ def test_sigmoid_u55_INT(test_data): @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone300 def test_sigmoid_u55_INT_add_sigmoid(test_data): pipeline = OpNotSupportedPipeline( SigmoidAddSigmoid(), @@ -167,7 +169,6 @@ def test_sigmoid_u85_INT(test_data): (test_data(),), Sigmoid.aten_op, Sigmoid.exir_op, - run_on_fvp=True, ) pipeline.change_args("quantize", get_32bit_sigmoid_quantizer()) pipeline.run() @@ -184,7 +185,6 @@ def test_sigmoid_u85_INT_add_sigmoid(test_data): (test_data(),), Sigmoid.aten_op, Sigmoid.exir_op, - run_on_fvp=True, ) pipeline.change_args("quantize", get_32bit_sigmoid_quantizer()) pipeline.run() diff --git a/backends/arm/test/ops/test_silu.py b/backends/arm/test/ops/test_silu.py index edc7d769be1..362358d0813 100644 --- a/backends/arm/test/ops/test_silu.py +++ b/backends/arm/test/ops/test_silu.py @@ -79,7 +79,9 @@ def test_silu_tosa_INT_inplace(test_data: input_t): def test_silu_u55_INT(test_data: input_t): silu_data = (test_data(), False) pipeline = EthosU55PipelineINT[input_t]( - Silu(), silu_data, Silu.aten_op_INT, run_on_fvp=True + Silu(), + silu_data, + Silu.aten_op_INT, ) pipeline.run() @@ -89,7 +91,9 @@ def test_silu_u55_INT(test_data: input_t): def test_silu_u55_INT_inplace(test_data: input_t): silu_data = (test_data(), True) pipeline = EthosU55PipelineINT[input_t]( - Silu(), silu_data, Silu.aten_op_INT, run_on_fvp=True + Silu(), + silu_data, + Silu.aten_op_INT, ) pipeline.run() @@ -99,7 +103,9 @@ def test_silu_u55_INT_inplace(test_data: input_t): def test_silu_u85_INT(test_data: 
input_t): silu_data = (test_data(), False) pipeline = EthosU85PipelineINT[input_t]( - Silu(), silu_data, Silu.aten_op_INT, run_on_fvp=True + Silu(), + silu_data, + Silu.aten_op_INT, ) pipeline.run() @@ -109,7 +115,9 @@ def test_silu_u85_INT(test_data: input_t): def test_silu_u85_INT_inplace(test_data: input_t): silu_data = (test_data(), True) pipeline = EthosU85PipelineINT[input_t]( - Silu(), silu_data, Silu.aten_op_INT, run_on_fvp=True + Silu(), + silu_data, + Silu.aten_op_INT, ) pipeline.run() diff --git a/backends/arm/test/ops/test_sin.py b/backends/arm/test/ops/test_sin.py index 3ca593ad608..06d06e3b11d 100644 --- a/backends/arm/test/ops/test_sin.py +++ b/backends/arm/test/ops/test_sin.py @@ -61,25 +61,25 @@ def test_sin_tosa_INT(test_data: Tuple): @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone300 def test_sin_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( Sin(), (test_data,), aten_op, exir_ops=[], - run_on_fvp=False, ) pipeline.run() @common.parametrize("test_data", test_data_suite) +@common.XfailIfNoCorstone320 def test_sin_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( Sin(), (test_data,), aten_op, exir_ops=[], - run_on_fvp=False, ) pipeline.run() diff --git a/backends/arm/test/ops/test_slice.py b/backends/arm/test/ops/test_slice.py index eafeb04320e..7e71a51899a 100644 --- a/backends/arm/test/ops/test_slice.py +++ b/backends/arm/test/ops/test_slice.py @@ -7,7 +7,6 @@ from typing import Tuple -import pytest import torch from executorch.backends.arm.quantizer.arm_quantizer import ( get_symmetric_a16w8_quantization_config, @@ -34,11 +33,11 @@ test_data_suite = { "ones_neg_3": lambda: (torch.ones(10), [(3, -3)]), "ones_neg_8": lambda: (torch.ones(10), [(-8, 3)]), - "ones_slice_2": lambda: (torch.ones(10, 10), [(1, 3), (3, None)]), - "ones_slice_3": lambda: (torch.ones(10, 10, 10), [(0, 7), (0, None), (0, 8)]), + "ones_slice_2": lambda: (torch.ones(10, 10), [(1, 3), (3, 10)]), + 
"ones_slice_3": lambda: (torch.ones(10, 10, 10), [(0, 7), (0, 10), (0, 8)]), "ones_slice_4": lambda: ( torch.ones((1, 12, 10, 10)), - [(None, None), (None, 5), (3, 5), (4, 10)], + [(0, 1), (0, 5), (3, 5), (4, 10)], ), } @@ -78,26 +77,32 @@ def test_slice_tensor_tosa_INT_nhwc(test_data: torch.Tensor): pipeline.run() -@common.parametrize("test_data", test_data_suite) +x_fails = { + "ones_slice_3": "MLETORCH-1402: Slice operator has incorrect number of inputs", + "ones_slice_4": "MLETORCH-1402: Slice operator has incorrect number of inputs", +} + + +@common.parametrize("test_data", test_data_suite, x_fails) +@common.XfailIfNoCorstone300 def test_slice_tensor_u55_INT(test_data: torch.Tensor): pipeline = EthosU55PipelineINT[input_t1]( Slice(), test_data(), aten_ops=[], exir_ops=[], - run_on_fvp=False, ) pipeline.run() -@common.parametrize("test_data", test_data_suite) +@common.parametrize("test_data", test_data_suite, x_fails) +@common.XfailIfNoCorstone320 def test_slice_tensor_u85_INT(test_data: torch.Tensor): pipeline = EthosU85PipelineINT[input_t1]( Slice(), test_data(), aten_ops=[], exir_ops=[], - run_on_fvp=False, ) pipeline.run() @@ -148,9 +153,6 @@ def get_symmetric_a16w8_slice_quantizer(per_channel_quantization=False): @common.parametrize("test_data", test_data_suite) -@pytest.mark.xfail( - reason="missing int16 slice ops support; fails at TOSA reference model with Unsupported operation type or rank. 
See: https://github.com/pytorch/executorch/issues/13976" -) def test_slice_tensor_16a8w_tosa_INT(test_data: torch.Tensor): """Test slice operation with 16A8W quantization (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -176,9 +178,6 @@ def test_slice_tensor_16a8w_tosa_INT(test_data: torch.Tensor): @common.parametrize("test_data", test_data_suite) @common.XfailIfNoCorstone300 -@pytest.mark.xfail( - reason="Vela compilation fails with 'Invalid arguments' for int16 slice operations" -) def test_slice_tensor_16a8w_u55_INT16(test_data: torch.Tensor): """Test slice operation with 16A8W quantization on U55 (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -190,7 +189,6 @@ def test_slice_tensor_16a8w_u55_INT16(test_data: torch.Tensor): exir_ops=[], per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( @@ -204,9 +202,6 @@ def test_slice_tensor_16a8w_u55_INT16(test_data: torch.Tensor): @common.parametrize("test_data", test_data_suite) @common.XfailIfNoCorstone320 -@pytest.mark.xfail( - reason="Vela compilation fails with 'Invalid arguments' for int16 slice operations" -) def test_slice_tensor_16a8w_u85_INT16(test_data: torch.Tensor): """Test slice operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -218,7 +213,6 @@ def test_slice_tensor_16a8w_u85_INT16(test_data: torch.Tensor): exir_ops=[], per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( diff --git a/backends/arm/test/ops/test_softmax.py b/backends/arm/test/ops/test_softmax.py index 6b4455fc702..22bd919fccd 100644 --- a/backends/arm/test/ops/test_softmax.py +++ b/backends/arm/test/ops/test_softmax.py @@ -65,7 +65,11 @@ def test_softmax_tosa_INT(test_data): @common.XfailIfNoCorstone300 def test_softmax_u55_INT(test_data): data, dim = test_data() - 
pipeline = EthosU55PipelineINT[input_t1](Softmax(dim), data, [], run_on_fvp=True) + pipeline = EthosU55PipelineINT[input_t1]( + Softmax(dim), + data, + [], + ) pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) pipeline.change_args("run_method_and_compare_outputs", qtol=1) pipeline.run() @@ -75,7 +79,11 @@ def test_softmax_u55_INT(test_data): @common.XfailIfNoCorstone320 def test_softmax_u85_INT(test_data): data, dim = test_data() - pipeline = EthosU85PipelineINT[input_t1](Softmax(dim), data, [], run_on_fvp=True) + pipeline = EthosU85PipelineINT[input_t1]( + Softmax(dim), + data, + [], + ) pipeline.add_stage_after("quantize", pipeline.tester.check_not, [aten_op]) pipeline.change_args("run_method_and_compare_outputs", qtol=1) pipeline.run() diff --git a/backends/arm/test/ops/test_split.py b/backends/arm/test/ops/test_split.py index 388e85762af..284c142a34e 100644 --- a/backends/arm/test/ops/test_split.py +++ b/backends/arm/test/ops/test_split.py @@ -132,17 +132,24 @@ def test_split_with_sizes_tosa_INT(test_data: input_t1): pipeline.run() +x_fails = { + "split_3d_2_sizes_dim": "MLETORCH-1403: Split operator is running out of memory when reading input file", + "split_4d_2_sizes_dim_neg": "MLETORCH-1403: Split operator is running out of memory when reading input file", +} + + @common.parametrize( "test_data", (Split.test_data | Split.test_data_list), + x_fails, ) +@common.XfailIfNoCorstone300 def test_split_with_sizes_u55_INT(test_data: input_t1): pipeline = EthosU55PipelineINT[input_t1]( Split(), test_data(), aten_ops=[], exir_ops=exir_op, - run_on_fvp=False, ) pipeline.run() @@ -150,7 +157,9 @@ def test_split_with_sizes_u55_INT(test_data: input_t1): @common.parametrize( "test_data", (Split.test_data | Split.test_data_list), + x_fails, ) +@common.XfailIfNoCorstone320 def test_split_with_sizes_u85_INT(test_data: input_t1): pipeline = EthosU85PipelineINT[input_t1]( @@ -158,7 +167,6 @@ def test_split_with_sizes_u85_INT(test_data: input_t1): 
test_data(), aten_ops=[], exir_ops=exir_op, - run_on_fvp=False, ) pipeline.run() diff --git a/backends/arm/test/ops/test_sqrt.py b/backends/arm/test/ops/test_sqrt.py index 15e2dd45322..13a2366b17c 100644 --- a/backends/arm/test/ops/test_sqrt.py +++ b/backends/arm/test/ops/test_sqrt.py @@ -70,7 +70,6 @@ def test_sqrt_u55_INT(test_data: Sqrt.input_t): test_data(), Sqrt.aten_op_INT, Sqrt.exir_op_INT, - run_on_fvp=True, ) pipeline.run() @@ -83,7 +82,6 @@ def test_sqrt_u85_INT(test_data: Sqrt.input_t): test_data(), Sqrt.aten_op_INT, Sqrt.exir_op_INT, - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_squeeze.py b/backends/arm/test/ops/test_squeeze.py index 0de51673496..3c2014cdcda 100644 --- a/backends/arm/test/ops/test_squeeze.py +++ b/backends/arm/test/ops/test_squeeze.py @@ -95,7 +95,6 @@ def test_squeeze_dim_u55_INT(test_data: Tuple): test_data(), aten_ops="torch.ops.aten.squeeze.default", exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -108,7 +107,6 @@ def test_squeeze_dim_u85_INT(test_data: Tuple): test_data(), aten_ops="torch.ops.aten.squeeze.default", exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -174,7 +172,6 @@ def test_squeeze_dim_u55_INT_2(test_data: Tuple): test_data(), aten_ops="torch.ops.aten.squeeze.dim", exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -187,7 +184,6 @@ def test_squeeze_dim_u85_INT_2(test_data: Tuple): test_data(), aten_ops="torch.ops.aten.squeeze.dim", exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -253,7 +249,6 @@ def test_squeeze_dims_u55_INT(test_data: Tuple): test_data(), aten_ops="torch.ops.aten.squeeze.dims", exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -266,7 +261,6 @@ def test_squeeze_dims_u85_INT(test_data: Tuple): test_data(), aten_ops="torch.ops.aten.squeeze.dims", exir_ops=[], - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_stack.py b/backends/arm/test/ops/test_stack.py new file mode 100644 index 00000000000..873a599992a --- /dev/null +++ 
b/backends/arm/test/ops/test_stack.py @@ -0,0 +1,150 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Tuple + +import torch +import torch.nn as nn + +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.test_pipeline import ( + EthosU55PipelineINT, + EthosU85PipelineINT, + TosaPipelineFP, + TosaPipelineINT, + VgfPipeline, +) + +test_data_suite = { + # (test_name, test_data) + "ones_two_tensors": lambda: ((torch.ones(1), torch.ones(1)), 0), + "ones_and_rand_three_tensors": lambda: ( + (torch.ones(1, 2), torch.randn(1, 2), torch.randn(1, 2)), + 1, + ), + "ones_and_rand_four_tensors": lambda: ( + ( + torch.ones(1, 2, 5), + torch.randn(1, 2, 5), + torch.randn(1, 2, 5), + torch.randn(1, 2, 5), + ), + -1, + ), + "rand_two_tensors": lambda: ( + (torch.randn(2, 2, 4), torch.randn(2, 2, 4)), + 2, + ), + "rand_two_tensors_dim_0": lambda: ( + (torch.randn(1, 2, 4, 4), torch.randn(1, 2, 4, 4)), + ), + "rand_two_tensors_dim_2": lambda: ( + (torch.randn(2, 2, 3, 5), torch.randn(2, 2, 3, 5)), + 2, + ), + "rand_large": lambda: ( + ( + 10000 * torch.randn(2, 3, 1, 4), + torch.randn(2, 3, 1, 4), + torch.randn(2, 3, 1, 4), + ), + -3, + ), +} + + +class Stack(nn.Module): + aten_op = "torch.ops.aten.stack.default" + exir_op = "executorch_exir_dialects_edge__ops_aten_cat_default" + + def forward(self, n: tuple[torch.Tensor, ...], dim: int = 0): + return torch.stack(n, dim) + + +input_t1 = Tuple[torch.Tensor] + + +@common.parametrize("test_module", test_data_suite) +def test_stack_tosa_FP(test_module: input_t1): + test_data = test_module() + pipeline = TosaPipelineFP[input_t1]( + Stack(), + test_data, + aten_op=Stack.aten_op, + exir_op=Stack.exir_op, + use_to_edge_transform_and_lower=False, + ) + pipeline.run() + + +@common.parametrize("test_module", test_data_suite) +def 
test_stack_tosa_INT(test_module: input_t1): + test_data = test_module() + pipeline = TosaPipelineINT[input_t1]( + Stack(), + test_data, + aten_op=Stack.aten_op, + exir_op=Stack.exir_op, + use_to_edge_transform_and_lower=False, + ) + pipeline.run() + + +@common.XfailIfNoCorstone300 +@common.parametrize("test_module", test_data_suite) +def test_stack_u55_INT(test_module: input_t1): + test_data = test_module() + pipeline = EthosU55PipelineINT[input_t1]( + Stack(), + test_data, + aten_ops=Stack.aten_op, + exir_ops=Stack.exir_op, + use_to_edge_transform_and_lower=False, + ) + pipeline.run() + + +@common.XfailIfNoCorstone320 +@common.parametrize("test_module", test_data_suite) +def test_stack_u85_INT(test_module: input_t1): + test_data = test_module() + pipeline = EthosU85PipelineINT[input_t1]( + Stack(), + test_data, + aten_ops=Stack.aten_op, + exir_ops=Stack.exir_op, + use_to_edge_transform_and_lower=False, + ) + pipeline.run() + + +@common.SkipIfNoModelConverter +@common.parametrize("test_module", test_data_suite) +def test_stack_vgf_FP(test_module: input_t1): + test_data = test_module() + pipeline = VgfPipeline[input_t1]( + Stack(), + test_data, + aten_op=Stack.aten_op, + exir_op=Stack.exir_op, + tosa_version="TOSA-1.0+FP", + use_to_edge_transform_and_lower=False, + ) + pipeline.run() + + +@common.SkipIfNoModelConverter +@common.parametrize("test_module", test_data_suite) +def test_stack_vgf_INT(test_module: input_t1): + test_data = test_module() + pipeline = VgfPipeline[input_t1]( + Stack(), + test_data, + aten_op=Stack.aten_op, + exir_op=Stack.exir_op, + tosa_version="TOSA-1.0+INT", + use_to_edge_transform_and_lower=False, + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_sub.py b/backends/arm/test/ops/test_sub.py index c691506beb2..68b6ad5fb93 100644 --- a/backends/arm/test/ops/test_sub.py +++ b/backends/arm/test/ops/test_sub.py @@ -10,8 +10,12 @@ from typing import Tuple import torch +from executorch.backends.arm.quantizer.arm_quantizer import ( + 
get_symmetric_a16w8_quantization_config, + TOSAQuantizer, +) -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.test_pipeline import ( EthosU55PipelineINT, EthosU85PipelineINT, @@ -19,6 +23,8 @@ TosaPipelineINT, VgfPipeline, ) +from executorch.backends.arm.tosa import TosaSpecification +from executorch.backends.xnnpack.test.tester import Quantize aten_op = "torch.ops.aten.sub.Tensor" exir_op = "executorch_exir_dialects_edge__ops_aten_sub_Tensor" @@ -73,6 +79,11 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): return x - y +class SubAlpha(torch.nn.Module): + def forward(self, x: torch.Tensor, y: torch.Tensor): + return torch.sub(x, y, alpha=5) + + class SubTan(torch.nn.Module): def forward(self, x: torch.Tensor, y: torch.Tensor): @@ -109,6 +120,18 @@ def test_sub_tensor_tosa_FP_2(test_data: Tuple[torch.Tensor, torch.Tensor]): pipeline.run() +@common.parametrize("test_data", sub_tan_test_data) +def test_sub_tensor_tosa_FP_alpha(test_data: Tuple[torch.Tensor, torch.Tensor]): + """Test Two-Operand Subtraction with alpha (TOSA FP)""" + pipeline = TosaPipelineFP[input_t2]( + SubAlpha(), + test_data(), + aten_op, + exir_op, + ) + pipeline.run() + + @common.parametrize("test_data", sub_test_data) def test_sub_tensor_tosa_INT(test_data): """Test Subtraction (TOSA INT)""" @@ -132,6 +155,15 @@ def test_sub_tensor_tosa_INT_3(test_data: Tuple[torch.Tensor, torch.Tensor]): pipeline.run() +@common.parametrize("test_data", sub_tan_test_data) +def test_sub_tensor_tosa_INT_alpha(test_data: Tuple[torch.Tensor, torch.Tensor]): + """Test Two-Operand Subtraction with alpha (TOSA INT)""" + pipeline = TosaPipelineINT[input_t2]( + SubAlpha(), test_data(), aten_op, exir_op, qtol=0 + ) + pipeline.run() + + @common.parametrize("test_data", sub_test_data) @common.XfailIfNoCorstone300 def test_sub_tensor_u55_INT(test_data): @@ -141,7 +173,6 @@ def test_sub_tensor_u55_INT(test_data): 
test_data(), aten_op, exir_op, - run_on_fvp=True, ) pipeline.run() @@ -155,7 +186,6 @@ def test_sub_tensor_u55_INT_2(test_data: Tuple[torch.Tensor, torch.Tensor]): test_data(), aten_op, exir_op, - run_on_fvp=True, ) pipeline.run() @@ -169,7 +199,6 @@ def test_sub_tensor_u85_INT_2(test_data): test_data(), aten_op, exir_op, - run_on_fvp=True, ) pipeline.run() @@ -183,7 +212,6 @@ def test_sub_tensor_u85_INT(test_data: Tuple[torch.Tensor, torch.Tensor]): test_data(), aten_op, exir_op, - run_on_fvp=True, ) pipeline.run() @@ -242,3 +270,96 @@ def test_sub_tensor_vgf_INT_2(test_data: Tuple[torch.Tensor, torch.Tensor]): tosa_version="TOSA-1.0+INT", ) pipeline.run() + + +def get_symmetric_a16w8_sub_quantizer(per_channel_quantization=False): + tosa_version = conftest.get_option("tosa_version") + tosa_profiles = { + "1.0": TosaSpecification.create_from_string("TOSA-1.0+INT+int16"), + } + + quantizer = TOSAQuantizer(tosa_profiles[tosa_version]) + quantizer.set_global( + get_symmetric_a16w8_quantization_config(is_per_channel=per_channel_quantization) + ) + + return Quantize( + quantizer, + get_symmetric_a16w8_quantization_config( + is_per_channel=per_channel_quantization + ), + ) + + +@common.parametrize("test_data", sub_test_data) +def test_sub_tensor_16a8w_tosa_INT(test_data: input_t1): + """Test sub operation with 16A8W quantization (16-bit activations, 8-bit weights)""" + per_channel_quantization = False + + pipeline = TosaPipelineINT[input_t1]( + Sub(), + test_data(), + aten_op, + exir_op=[], + per_channel_quantization=per_channel_quantization, + use_to_edge_transform_and_lower=True, + tosa_extensions=["int16"], + ) + + pipeline.change_args( + "quantize", + get_symmetric_a16w8_sub_quantizer( + per_channel_quantization=per_channel_quantization + ), + ) + pipeline.run() + + +@common.parametrize("test_data", sub_test_data) +@common.XfailIfNoCorstone300 +def test_sub_tensor_16a8w_u55_INT16(test_data: input_t1): + """Test sub operation with 16A8W quantization on U55 (16-bit 
activations, 8-bit weights)""" + per_channel_quantization = False + + pipeline = EthosU55PipelineINT[input_t1]( + Sub(), + test_data(), + aten_op, + exir_op, + per_channel_quantization=per_channel_quantization, + use_to_edge_transform_and_lower=True, + run_on_fvp=True, + ) + + pipeline.change_args( + "quantize", + get_symmetric_a16w8_sub_quantizer( + per_channel_quantization=per_channel_quantization + ), + ) + pipeline.run() + + +@common.parametrize("test_data", sub_test_data) +@common.XfailIfNoCorstone320 +def test_sub_tensor_16a8w_u85_INT16(test_data: input_t1): + """Test sub operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)""" + per_channel_quantization = False + + pipeline = EthosU85PipelineINT[input_t1]( + Sub(), + test_data(), + aten_op, + exir_op, + per_channel_quantization=per_channel_quantization, + use_to_edge_transform_and_lower=True, + run_on_fvp=True, + ) + + pipeline.change_args( + "quantize", + get_symmetric_a16w8_sub_quantizer( + per_channel_quantization=per_channel_quantization + ), + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_sum.py b/backends/arm/test/ops/test_sum.py index 9308315f76d..13c1e029032 100644 --- a/backends/arm/test/ops/test_sum.py +++ b/backends/arm/test/ops/test_sum.py @@ -72,7 +72,6 @@ def test_view_u55_INT_1_0(test_data: Tuple): test_data(), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -85,7 +84,6 @@ def test_view_u85_INT_1_0(test_data: Tuple): test_data(), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -94,7 +92,11 @@ def test_view_u85_INT_1_0(test_data: Tuple): @common.SkipIfNoModelConverter def test_sum_dim_intlist_vgf_FP(test_data: input_t1): pipeline = VgfPipeline[input_t1]( - Sum(), test_data(), aten_op, tosa_version="TOSA-1.0+FP" + Sum(), + test_data(), + aten_op, + tosa_version="TOSA-1.0+FP", + run_on_vulkan_runtime=True, ) pipeline.run() @@ -107,6 +109,7 @@ def test_sum_dim_intlist_vgf_INT(test_data: input_t1): test_data(), aten_op, 
tosa_version="TOSA-1.0+INT", + run_on_vulkan_runtime=True, ) pipeline.run() @@ -119,7 +122,7 @@ def test_sum_dim_intlist_vgf_INT(test_data: input_t1): @common.parametrize("test_data", reject_inputs) -def test_view_u55_INT_not_delegated(test_data: Tuple): +def test_view_u55_INT_failure_set(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( Sum(), test_data(), diff --git a/backends/arm/test/ops/test_tanh.py b/backends/arm/test/ops/test_tanh.py index f3f4df31d0e..8dc967c01d7 100644 --- a/backends/arm/test/ops/test_tanh.py +++ b/backends/arm/test/ops/test_tanh.py @@ -77,7 +77,6 @@ def test_tanh_u55_INT(test_data: Tuple): (test_data(),), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -90,7 +89,6 @@ def test_tanh_u85_INT(test_data: Tuple): (test_data(),), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -178,7 +176,6 @@ def test_tanh_16a8w_u55_INT16(test_data: torch.Tensor): exir_ops=[], per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( @@ -206,7 +203,6 @@ def test_tanh_16a8w_u85_INT16(test_data: torch.Tensor): exir_ops=[], per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( diff --git a/backends/arm/test/ops/test_to_copy.py b/backends/arm/test/ops/test_to_copy.py index 5c01788c805..1fdc4619131 100644 --- a/backends/arm/test/ops/test_to_copy.py +++ b/backends/arm/test/ops/test_to_copy.py @@ -192,20 +192,15 @@ def test_to_vgf_INT(test_data: Tuple): ), } -redundant_xfails_FP = { +redundant_xfails = { "rand_fp16_fp16": "FP16 is not supported", "rand_int8_int8": "Tracing graph with quantized input is not supported.", "rand_int16_int16": "Tracing graph with quantized input is not supported.", } -redundant_xfails_INT = { - "rand_fp16_fp16": "FP16 is not supported", - "rand_int8_int8": "Tracing graph with quantized input is not supported.", -} - @common.parametrize( - "test_data", 
_TO_COPY_TEST_DATA_REDUNDANT_CAST, xfails=redundant_xfails_FP + "test_data", _TO_COPY_TEST_DATA_REDUNDANT_CAST, xfails=redundant_xfails ) def test_to_tosa_FP_REDUNDANT_CAST(test_data: Tuple): test_tensor, new_dtype = test_data() @@ -220,7 +215,7 @@ def test_to_tosa_FP_REDUNDANT_CAST(test_data: Tuple): @common.parametrize( - "test_data", _TO_COPY_TEST_DATA_REDUNDANT_CAST, xfails=redundant_xfails_INT + "test_data", _TO_COPY_TEST_DATA_REDUNDANT_CAST, xfails=redundant_xfails ) def test_to_tosa_INT_REDUNDANT_CAST(test_data: Tuple): test_tensor, new_dtype = test_data() @@ -244,3 +239,32 @@ def test_to_tosa_INT_not_delegated_REDUNDANT_CAST(test_data: Tuple): non_delegated_ops={}, # These are removed outside of the Arm backend so the graph is empty ) pipeline.run() + + +_TO_COPY_DATA_INT_U55_REJECT = { + "rand_bool_int8": lambda: ( + torch.randint(0, 2, (1, 2, 3, 4), dtype=torch.bool), + torch.int8, + ), + "rand_int16_bool": lambda: ( + torch.randint(-1000, 1000, (1, 2, 3, 4), dtype=torch.int16), + torch.bool, + ), + "rand_int32_int8": lambda: ( + torch.randint(-1000, 1000, (1, 2, 3, 4), dtype=torch.int32), + torch.int8, + ), +} + + +@common.parametrize("test_data", _TO_COPY_DATA_INT_U55_REJECT) +def test_to_u55_INT(test_data: Tuple): + test_tensor, new_dtype = test_data() + pipeline = OpNotSupportedPipeline[input_t1]( + Cast(new_dtype), + (test_tensor,), + u55_subset=True, + quantize=True, + non_delegated_ops={}, # These are removed outside of the Arm backend so the graph is empty + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_unary_combos.py b/backends/arm/test/ops/test_unary_combos.py index db442d2d8d0..bfeb9b59e80 100644 --- a/backends/arm/test/ops/test_unary_combos.py +++ b/backends/arm/test/ops/test_unary_combos.py @@ -109,7 +109,10 @@ def test_unary_combos_tosa_INT(model_cls): def test_unary_combos_u55_INT(model_cls): m, inputs, exir = _build(model_cls) p = EthosU55PipelineINT[Tensor1]( - m, inputs, aten_ops=[], exir_ops=exir, run_on_fvp=True + m, + 
inputs, + aten_ops=[], + exir_ops=exir, ) p.run() @@ -119,7 +122,10 @@ def test_unary_combos_u55_INT(model_cls): def test_unary_combos_u85_INT(model_cls): m, inputs, exir = _build(model_cls) p = EthosU85PipelineINT[Tensor1]( - m, inputs, aten_ops=[], exir_ops=exir, run_on_fvp=True + m, + inputs, + aten_ops=[], + exir_ops=exir, ) p.run() diff --git a/backends/arm/test/ops/test_unsqueeze.py b/backends/arm/test/ops/test_unsqueeze.py index 9da1a352ebb..c76c1236ab3 100644 --- a/backends/arm/test/ops/test_unsqueeze.py +++ b/backends/arm/test/ops/test_unsqueeze.py @@ -65,7 +65,6 @@ def test_unsqueeze_u55_INT(test_tensor: torch.Tensor): (*test_tensor, 0), aten_op, exir_ops=[], - run_on_fvp=False, ) pipeline.run() @@ -78,7 +77,6 @@ def test_unsqueeze_u85_INT(test_tensor: torch.Tensor): (*test_tensor, 0), aten_op, exir_ops=[], - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_upsample_bilinear2d.py b/backends/arm/test/ops/test_upsample_bilinear2d.py index 95e69bc5204..1edba708f1f 100644 --- a/backends/arm/test/ops/test_upsample_bilinear2d.py +++ b/backends/arm/test/ops/test_upsample_bilinear2d.py @@ -259,7 +259,6 @@ def test_upsample_bilinear2d_vec_U85_INT_Upsample(test_data: input_t1): Upsample(size, scale_factor), (test_data,), aten_op, - run_on_fvp=True, qtol=1, use_to_edge_transform_and_lower=True, ) @@ -279,7 +278,6 @@ def test_upsample_bilinear2d_vec_U85_INT_Interpolate( Interpolate(size, scale_factor), (test_data,), aten_op, - run_on_fvp=True, qtol=1, use_to_edge_transform_and_lower=True, ) @@ -299,7 +297,6 @@ def test_upsample_bilinear2d_vec_U85_INT_UpsamplingBilinear2d( UpsamplingBilinear2d(size, scale_factor), (test_data,), aten_op, - run_on_fvp=True, qtol=1, use_to_edge_transform_and_lower=True, ) diff --git a/backends/arm/test/ops/test_var.py b/backends/arm/test/ops/test_var.py index 9567f90c480..9f1c437fc65 100644 --- a/backends/arm/test/ops/test_var.py +++ b/backends/arm/test/ops/test_var.py @@ -194,7 +194,6 @@ def 
test_var_dim_u55_INT_no_dim(test_data: Tuple): (test_data,), aten_ops=[], exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -208,7 +207,6 @@ def test_var_dim_u85_INT_no_dim(test_data: Tuple): (test_data,), aten_ops=[], exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -276,7 +274,6 @@ def test_var_dim_u55_INT(test_data: Tuple): (test_data,), aten_ops=[], exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -290,7 +287,6 @@ def test_var_dim_u85_INT(test_data: Tuple): (test_data,), aten_ops=[], exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -357,7 +353,6 @@ def test_var_dim_u55_INT_correction(test_data: Tuple): (test_data,), aten_ops=[], exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -371,7 +366,6 @@ def test_var_dim_u85_INT_correction(test_data: Tuple): (test_data,), aten_ops=[], exir_ops=[], - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/ops/test_view.py b/backends/arm/test/ops/test_view.py index ed942c07aa1..3e706ae1cac 100644 --- a/backends/arm/test/ops/test_view.py +++ b/backends/arm/test/ops/test_view.py @@ -9,7 +9,6 @@ from typing import Tuple -import pytest import torch from executorch.backends.arm.quantizer.arm_quantizer import ( get_symmetric_a16w8_quantization_config, @@ -180,9 +179,6 @@ def get_symmetric_a16w8_view_quantizer(per_channel_quantization=False): @common.parametrize("test_data", View.needs_transpose_tests) -@pytest.mark.xfail( - reason="missing int16 view ops support; fails at TOSA reference model with Unsupported operation type or rank. 
See: https://github.com/pytorch/executorch/issues/13977" -) def test_view_16a8w_tosa_INT(test_data: Tuple): """Test view operation with 16A8W quantization (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -209,9 +205,6 @@ def test_view_16a8w_tosa_INT(test_data: Tuple): @common.parametrize("test_data", View.needs_transpose_tests) @common.XfailIfNoCorstone300 -@pytest.mark.xfail( - reason="Vela compilation fails with 'Invalid arguments' for int16 view operations" -) def test_view_16a8w_u55_INT16(test_data: Tuple): """Test view operation with 16A8W quantization on U55 (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -224,7 +217,6 @@ def test_view_16a8w_u55_INT16(test_data: Tuple): exir_ops=[], per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( @@ -238,9 +230,6 @@ def test_view_16a8w_u55_INT16(test_data: Tuple): @common.parametrize("test_data", View.needs_transpose_tests) @common.XfailIfNoCorstone320 -@pytest.mark.xfail( - reason="Vela compilation fails with 'Invalid arguments' for int16 view operations" -) def test_view_16a8w_u85_INT16(test_data: Tuple): """Test view operation with 16A8W quantization on U85 (16-bit activations, 8-bit weights)""" per_channel_quantization = False @@ -253,7 +242,6 @@ def test_view_16a8w_u85_INT16(test_data: Tuple): exir_ops=[], per_channel_quantization=per_channel_quantization, use_to_edge_transform_and_lower=True, - run_on_fvp=True, ) pipeline.change_args( diff --git a/backends/arm/test/ops/test_where.py b/backends/arm/test/ops/test_where.py index ea036d26361..a35a9fc3b7d 100644 --- a/backends/arm/test/ops/test_where.py +++ b/backends/arm/test/ops/test_where.py @@ -139,8 +139,11 @@ def scalar_condition(input: torch.Tensor): test_modules_FP = { **test_modules_common, - "float32_tensor_cond_tuple_dtype": lambda: float32_tensor_cond_tuple_dtype, "float32_tensor_cond_tuple_dtype_bool": lambda: 
float32_tensor_cond_tuple_dtype_bool, +} + +test_modules_FP_unsupported_dtype = { + "float32_tensor_cond_tuple_dtype": lambda: float32_tensor_cond_tuple_dtype, "int32_scalar_cond": lambda: int32_scalar_cond, } @@ -162,6 +165,17 @@ def test_where_self_tosa_FP(test_module): pipeline.run() +@common.parametrize("test_module", test_modules_FP_unsupported_dtype) +def test_where_self_tosa_FP_unsupported_dtype(test_module): + pipeline = OpNotSupportedPipeline[input_t]( + test_module(), + test_module().get_inputs(), + {exir_op: 1}, + n_expected_delegates=1, # condition can be delegated + ) + pipeline.run() + + @common.parametrize("test_module", test_modules_INT) def test_where_self_tosa_INT(test_module): pipeline = TosaPipelineINT[input_t]( @@ -212,7 +226,6 @@ def test_where_self_u85_INT(test_module): test_module().get_inputs(), aten_op, exir_op, - run_on_fvp=True, symmetric_io_quantization=True, ) pipeline.run() diff --git a/backends/arm/test/passes/test_insert_rescale_i32_pass.py b/backends/arm/test/passes/test_insert_rescale_i32_pass.py new file mode 100644 index 00000000000..096c90d330d --- /dev/null +++ b/backends/arm/test/passes/test_insert_rescale_i32_pass.py @@ -0,0 +1,77 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Tuple + +import torch +from executorch.backends.arm._passes import ( + FoldAndAnnotateQParamsPass, + InsertRescaleInt32Pass, +) +from executorch.backends.arm.test.tester.test_pipeline import PassPipeline + + +class NeedsRescaleOps(torch.nn.Module): + """A module containing ops that require INT32 inputs/outputs.""" + + input_t = Tuple[torch.Tensor, torch.Tensor] + + def __init__(self): + super().__init__() + + def forward(self, x, y): + a = torch.maximum(x, y) + b = torch.abs(a) + c = a > b + return c + + def get_inputs(self, dtype) -> input_t: + if dtype == torch.float32: + return (torch.rand(1, 3, 5, 6), torch.rand(1, 3, 5, 6)) + elif dtype == torch.int32: + return ( + torch.randint(3, 5, (3,), dtype=torch.int32), + torch.randint(3, 5, (3,), dtype=torch.int32), + ) + else: + raise ValueError("Not a valid input dtype for model") + + +def test_insert_rescales(): + module = NeedsRescaleOps() + input_t = Tuple[torch.Tensor, torch.Tensor] + ops_not_before = {"executorch_exir_dialects_backend__ops_tosa_RESCALE_default"} + ops_after = { + # "number of op nodes with i8 output" + "number of i8 node inputs" + "executorch_exir_dialects_backend__ops_tosa_RESCALE_default": 2 + + 5, + } + pipeline = PassPipeline[input_t]( + module, + module.get_inputs(torch.float32), + quantize=True, + ops_not_before_pass=ops_not_before, + ops_after_pass=ops_after, + pass_list=[FoldAndAnnotateQParamsPass, InsertRescaleInt32Pass], + ) + pipeline.pop_stage("run_method_and_compare_outputs") + pipeline.run() + + +def test_dont_insert_rescales(): + module = NeedsRescaleOps() + input_t = Tuple[torch.Tensor, torch.Tensor] + ops_not_before = {"executorch_exir_dialects_backend__ops_tosa_RESCALE_default"} + # All inputs are already i32. Rescales should not be added. 
+ ops_not_after = {"executorch_exir_dialects_backend__ops_tosa_RESCALE_default"} + pipeline = PassPipeline[input_t]( + module, + module.get_inputs(torch.int32), + ops_not_before_pass=ops_not_before, + ops_not_after_pass=ops_not_after, + pass_list=[FoldAndAnnotateQParamsPass, InsertRescaleInt32Pass], + ) + pipeline.pop_stage("run_method_and_compare_outputs") + pipeline.run() diff --git a/backends/arm/test/passes/test_rescale_pass.py b/backends/arm/test/passes/test_rescale_pass.py index 3baa03fde65..9774ebd2fcd 100644 --- a/backends/arm/test/passes/test_rescale_pass.py +++ b/backends/arm/test/passes/test_rescale_pass.py @@ -183,7 +183,6 @@ def test_quantized_rescale_u55(test_data: tuple[torch.Tensor, torch.Tensor]): test_data=test_data, aten_ops=[], exir_ops=[], - run_on_fvp=True, ) pipeline.run() @@ -199,6 +198,5 @@ def test_quantized_rescale_u85(test_data: tuple[torch.Tensor, torch.Tensor]): test_data=test_data, aten_ops=[], exir_ops=[], - run_on_fvp=True, ) pipeline.run() diff --git a/backends/arm/test/passes/test_to_tosa_memory_format.py b/backends/arm/test/passes/test_to_tosa_memory_format.py index 1e9b8ffc63d..643a3bf5733 100644 --- a/backends/arm/test/passes/test_to_tosa_memory_format.py +++ b/backends/arm/test/passes/test_to_tosa_memory_format.py @@ -6,7 +6,10 @@ from typing import Tuple import torch -from executorch.backends.arm._passes import ToTosaMemoryFormatPass +from executorch.backends.arm._passes import ( + AnnotateOutputDimOrderPass, + ToTosaMemoryFormatPass, +) from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.test_pipeline import ( @@ -177,7 +180,10 @@ def test_to_tosa_memory_format_tosa_INT(module): ops_after_pass=module.ops_after_pass, ops_not_after_pass=module.ops_not_after_pass, pass_list=[RemoveGetItemPass], - passes_with_exported_program=[ToTosaMemoryFormatPass], + passes_with_exported_program=[ + AnnotateOutputDimOrderPass, + ToTosaMemoryFormatPass, + ], ) pipeline.pop_stage( 
"run_method_and_compare_outputs" diff --git a/backends/arm/test/runner_utils.py b/backends/arm/test/runner_utils.py index 1b59b186a2e..3d002eff25e 100644 --- a/backends/arm/test/runner_utils.py +++ b/backends/arm/test/runner_utils.py @@ -13,11 +13,19 @@ from pathlib import Path +from types import NoneType from typing import Any, cast, Dict, List, Literal, Optional, Tuple import numpy as np import torch +from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec +from executorch.backends.arm.constants import ( + NHWC_INVERSE_ORDER, + NHWC_ORDER, + NNHWC_INVERSE_ORDER, + NNHWC_ORDER, +) from executorch.backends.arm.ethosu import EthosUCompileSpec from executorch.backends.arm.test.conftest import is_option_enabled @@ -157,6 +165,36 @@ def get_output_quantization_params( return quant_params +def torch_tensor_to_numpy(tensor: torch.Tensor) -> np.ndarray: + dtype = _torch_to_numpy_dtype_dict[tensor.dtype] + array = tensor.detach().numpy().astype(dtype) + dim_order = tensor.dim_order() + if dim_order == NHWC_ORDER: + a = array.transpose(NHWC_ORDER) + return a + elif dim_order == NNHWC_ORDER: + return array.transpose(NNHWC_ORDER) + else: + return array + + +def numpy_to_torch_tensor(array: np.ndarray, output_node: Node) -> torch.Tensor: + output_tensor = get_first_fake_tensor(output_node) + shape = output_tensor.shape + dim_order = output_tensor.dim_order() + if dim_order == NHWC_ORDER: + shape_with_dim_order = [shape[i] for i in NHWC_ORDER] + tensor = torch.from_numpy(array).reshape(shape_with_dim_order) + return tensor.permute(NHWC_INVERSE_ORDER).to(memory_format=torch.channels_last) + elif dim_order == NNHWC_ORDER: + shape_with_dim_order = [shape[i] for i in NNHWC_ORDER] + tensor = torch.from_numpy(array).reshape(shape_with_dim_order) + return tensor.permute(NNHWC_INVERSE_ORDER).to(memory_format=torch.channels_last) + else: + tensor = torch.from_numpy(array).reshape(shape) + 
return tensor + + class TosaReferenceModelDispatch(TorchFunctionMode): """A context manager for executing call_delegate nodes using the reference model""" @@ -168,7 +206,8 @@ def _tosa_dispatch(self, lowered_backend_module: LoweredBackendModule, inputs): tosa_buffer = lowered_backend_module.processed_bytes compile_spec = TosaCompileSpec.from_list(lowered_backend_module.compile_specs) - return run_tosa_graph(tosa_buffer, compile_spec.tosa_spec, inputs) + output_node = lowered_backend_module.original_module.graph.output_node() + return run_tosa_graph(tosa_buffer, compile_spec.tosa_spec, inputs, output_node) def __exit__(self, exc_type, exc_val, exc_tb): super().__exit__(exc_type, exc_val, exc_tb) @@ -190,6 +229,22 @@ def __torch_function__(self, func, types, args=..., kwargs=None): ) kwargs = kwargs or {} + + # This is a hack since Q/DQ ops does not handle channels last input correctly: the simplest and most robust + # workaround is to simply run them in channels first format and then convert back to channels last. 
+ if func in ( + torch.ops.quantized_decomposed.quantize_per_tensor.out, + torch.ops.quantized_decomposed.dequantize_per_tensor.out, + torch.ops.quantized_decomposed.quantize_per_channel.out, + torch.ops.quantized_decomposed.dequantize_per_channel.out, + ): + + input_dim_order = args[0].dim_order() + if input_dim_order in (NHWC_ORDER, NNHWC_ORDER): + args = [args[0].to(memory_format=torch.contiguous_format), *args[1:]] + res = func(*args, **kwargs) + return res.to(memory_format=torch.channels_last) + return func(*args, **kwargs) @@ -244,14 +299,13 @@ def get_output_from_file( output_np = [] output_node = exported_program.graph_module.graph.output_node() for i, node in enumerate(output_node.args[0]): - output_shape = node.meta["val"].shape output_dtype = node.meta["val"].dtype tosa_ref_output = np.fromfile( os.path.join(intermediate_path, f"{output_base_name}-{i}.bin"), _torch_to_numpy_dtype_dict[output_dtype], ) - output_np.append(torch.from_numpy(tosa_ref_output).reshape(output_shape)) + output_np.append(numpy_to_torch_tensor(tosa_ref_output, node)) return tuple(output_np) @@ -437,11 +491,14 @@ def prep_data_for_save( quant_param: Optional[QuantizationParams] = None, ): if isinstance(data, torch.Tensor): - data_np = np.array(data.detach(), order="C").astype( - _torch_to_numpy_dtype_dict[data.dtype] - ) + data_np = torch_tensor_to_numpy(data) + elif isinstance(data, (int, float, bool, NoneType)): + return np.array(data) else: - data_np = np.array(data) + raise RuntimeError( + f"Input dtype {type(data)} could not be converted to numpy array." 
+ ) + if quant_param is not None: assert quant_param.node_name in input_name, ( f"The quantization params name '{quant_param.node_name}' does not " @@ -455,30 +512,8 @@ def prep_data_for_save( f"{quant_param.dtype}".replace("torch.", "") ) # Use string format of dtype to convert to numpy dtype ) - return data_np - - -def save_npy( - path: str, - data, - input_name: str, - quant_param: Optional[QuantizationParams] = None, -) -> str: - """Serializes and saves 'data' as a .npy file, possibly quantizing it before. - - Parameters: - path: the directory where to save the data. - data: the data to save. - input_name: the name of the file, without file-ending. - quant_param: the parameters to use for quantization. - Returns: - the full file path of the output. - """ - data_np = prep_data_for_save(data, input_name, quant_param) - file_path = os.path.join(path, input_name + ".npy") - np.save(file_path, data_np, allow_pickle=False) - return file_path + return data_np def save_bytes( @@ -691,9 +726,12 @@ def run_tosa_graph( graph: Any, tosa_version: TosaSpecification, inputs: list[torch.Tensor], + output_node: Node, ) -> list[torch.Tensor]: """Runs the TOSA reference model with inputs and returns the result.""" - inputs_np = [input.numpy() for input in inputs] + + # Convert tensors to numpy arrays with correct dim_order + inputs_np = [torch_tensor_to_numpy(input_tensor) for input_tensor in inputs] if isinstance(tosa_version, Tosa_1_00): import tosa_reference_model as reference_model @@ -715,7 +753,13 @@ def run_tosa_graph( status == reference_model.GraphStatus.TOSA_VALID ), "Non-valid TOSA given to reference model." 
- return [torch.from_numpy(output) for output in outputs_np] + # Convert output numpy arrays to tensors with same dim_order as the output nodes + result = [ + numpy_to_torch_tensor(output_array, node) + for output_array, node in zip(outputs_np, output_node.args[0]) + ] + + return result def get_target_board(compile_spec: ArmCompileSpec) -> str | None: diff --git a/backends/arm/test/targets.bzl b/backends/arm/test/targets.bzl index f240855cdf4..5fdd1c3d827 100644 --- a/backends/arm/test/targets.bzl +++ b/backends/arm/test/targets.bzl @@ -4,7 +4,7 @@ load("@fbcode_macros//build_defs:python_pytest.bzl", "python_pytest") load("@bazel_skylib//lib:paths.bzl", "paths") def define_arm_tests(): - # TODO Add more tests + # TODO [fbonly] Add more tests test_files = [] # Passes @@ -22,9 +22,11 @@ def define_arm_tests(): "ops/test_mul.py", "ops/test_slice.py", "ops/test_sigmoid.py", + "ops/test_sub.py", "ops/test_tanh.py", "ops/test_view.py", "ops/test_cos.py", + "ops/test_to_copy.py", ] # Quantization @@ -39,7 +41,7 @@ def define_arm_tests(): "misc/test_bn_relu_folding_qat.py", "misc/test_custom_partition.py", "misc/test_debug_hook.py", - "misc/test_dim_order_guards.py", + # "misc/test_dim_order.py", (TODO - T238390249) "misc/test_outputs_order.py", ] diff --git a/backends/arm/test/test_arm_baremetal.sh b/backends/arm/test/test_arm_baremetal.sh index 53c707cad28..b8e8aee4e3a 100755 --- a/backends/arm/test/test_arm_baremetal.sh +++ b/backends/arm/test/test_arm_baremetal.sh @@ -155,17 +155,18 @@ test_pytest_ethosu_fvp() { # Same as test_pytest but also sometime verify using test_pytest_ops_vkml() { # Same as test_pytest but also sometime verify using VKML runtime - echo "${TEST_SUITE_NAME}: Run pytest with VKML" + echo "${TEST_SUITE_NAME}: Run pytest operator tests with VKML runtime" backends/arm/scripts/build_executorch.sh backends/arm/test/setup_testing_vkml.sh - pytest --verbose --color=yes --numprocesses=auto --durations=10 backends/arm/test/ 
--ignore=backends/arm/test/models + pytest --verbose --color=yes --numprocesses=auto --durations=10 backends/arm/test/ \ + --ignore=backends/arm/test/models -k _vgf_ echo "${TEST_SUITE_NAME}: PASS" } test_pytest_models_vkml() { # Same as test_pytest but also sometime verify VKML runtime - echo "${TEST_SUITE_NAME}: Run pytest with VKML" + echo "${TEST_SUITE_NAME}: Run pytest model tests with VKML runtime" backends/arm/scripts/build_executorch.sh backends/arm/test/setup_testing_vkml.sh @@ -173,7 +174,7 @@ test_pytest_models_vkml() { # Same as test_pytest but also sometime verify VKML # Install model dependencies for pytest source backends/arm/scripts/install_models_for_test.sh - pytest --verbose --color=yes --numprocesses=auto --durations=0 backends/arm/test/models + pytest --verbose --color=yes --numprocesses=auto --durations=0 backends/arm/test/models -k _vgf_ echo "${TEST_SUITE_NAME}: PASS" } @@ -365,5 +366,20 @@ test_smaller_stories_llama() { echo "${TEST_SUITE_NAME}: PASS" } +test_memory_allocation() { + echo "${TEST_SUITE_NAME}: Test ethos-u memory allocation with run.sh" + + mkdir -p arm_test/test_run + # Ethos-U85 + echo "${TEST_SUITE_NAME}: Test target Ethos-U85" + examples/arm/run.sh --et_build_root=arm_test/test_run --target=ethos-u85-128 --model_name=examples/arm/example_modules/add.py &> arm_test/test_run/full.log + python3 backends/arm/test/test_memory_allocator_log.py --log arm_test/test_run/full.log \ + --require "model_pte_program_size" "<= 3000 B" \ + --require "method_allocator_planned" "<= 64 B" \ + --require "method_allocator_loaded" "<= 1024 B" \ + --require "method_allocator_input" "<= 4 B" \ + --require "Total DRAM used" "<= 0.06 KiB" + echo "${TEST_SUITE_NAME}: PASS" +} ${TEST_SUITE} diff --git a/backends/arm/test/test_memory_allocator_log.py b/backends/arm/test/test_memory_allocator_log.py new file mode 100644 index 00000000000..3853b60b7f6 --- /dev/null +++ b/backends/arm/test/test_memory_allocator_log.py @@ -0,0 +1,170 @@ +# Copyright 2025 
Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +""" +Check log files for memory metrics and compare them against thresholds. + +Usage example: + python3 test_memory_allocator_log.py \ + --log path/to/log.txt \ + --require "Total SRAM used" "<= 310 KiB" \ + --require "method_allocator_input" "<= 4 B" +""" + +import argparse +import re +import sys +from typing import List, Optional, Tuple + + +def unit_factor(u: str) -> float: + if not u: + return 1.0 + ul = u.strip().lower() + table = { + "b": 1, + "byte": 1, + "bytes": 1, + "kb": 1000, + "mb": 1000**2, + "gb": 1000**3, + "kib": 1024, + "mib": 1024**2, + "gib": 1024**3, + } + if ul in table: + return float(table[ul]) + return 1.0 + + +def parse_value(text_num: str, text_unit: Optional[str]) -> float: + return float(text_num) * unit_factor(text_unit or "") + + +def parse_cond(cond: str) -> Tuple[str, float, str]: + # Regexp explained. Example of things it will parse: + # "< 310 KiB", ">=10MB", "== 42", "!=3 bytes", "<=0.5 MiB" + + # The regexp explained in detail: + # ^: anchor the match to the start and end of the string (no extra chars allowed). + # \s*: optional whitespace (spaces, tabs, etc.). + # (<=|>=|==|!=|<|>): capturing group 1. One of the comparison operators: <=, >=, ==, !=, <, >. + # \s*: optional whitespace. + # ([0-9]+(?:\.[0-9]+)?): capturing group 2. A number: + # [0-9]+: one or more digits (the integer part). + # (?:\.[0-9]+)?: optional non-capturing group for a fractional part like .25. + # \s*: optional whitespace between number and unit + # ([A-Za-z]+)?: capturing group 3, optional. A unit made of letters only (e.g., B, KB, KiB, MB, MiB). Case# insensitive by class choice. + # \s*: optional trailing whitespace. 
+ m = re.match( + r"^\s*(<=|>=|==|!=|<|>)\s*([0-9]+(?:\.[0-9]+)?)\s*([A-Za-z]+)?\s*$", cond + ) + if not m: + raise ValueError(f"Invalid condition: {cond}") + op, num, unit = m.groups() + return op, float(num), (unit or "") + + +def compare(a: float, b: float, op: str) -> bool: + return { + "<": a < b, + "<=": a <= b, + ">": a > b, + ">=": a >= b, + "==": abs(a - b) < 1e-9, + "!=": abs(a - b) >= 1e-9, + }[op] + + +def find_metric_value(line: str, label: str) -> Tuple[Optional[str], Optional[str]]: + # Same regexp as parse_cond() but without the first group of matching comparison operators + # First go, search for the pattern but escape and ignore cases + # The regexp: + # ([0-9]+(?:\.[0-9]+)?) — capturing group 1: a decimal number + # [0-9]+ — one or more digits (integer part) + # (?:\.[0-9]+)? — optional fractional part like .25 (non-capturing) + # \s* — optional whitespace between number and unit + # ([A-Za-z]+)? — capturing group 2 (optional): a unit made only of letters (e.g., B, KB, KiB, MB) + m = re.search( + re.escape(label) + r".*?([0-9]+(?:\.[0-9]+)?)\s*([A-Za-z]+)?", + line, + flags=re.IGNORECASE, + ) + if m: + return m.group(1), m.group(2) + # Second go, same regexp as above but not caring about label. 
If + # no number was tied to a label be happy just salvaging it from + # the line + m = re.search(r"([0-9]+(?:\.[0-9]+)?)\s*([A-Za-z]+)?", line) + if m: + return m.group(1), m.group(2) + return None, None + + +def first_line_with_label(lines: List[str], label: str) -> Optional[str]: + label_lc = label.lower() + return next((ln for ln in lines if label_lc in ln.lower()), None) + + +def check_requirement(label: str, cond: str, lines: List[str]) -> Optional[str]: + op, thr_num, thr_unit = parse_cond(cond) + matched = first_line_with_label(lines, label) + if matched is None: + return f"{label}: not found in log" + + num_str, unit_str = find_metric_value(matched, label) + if num_str is None: + return f"{label}: value not found on line: {matched.strip()}" + + left_bytes = parse_value(num_str, unit_str) + right_bytes = parse_value(str(thr_num), thr_unit or (unit_str or "")) + ok = compare(left_bytes, right_bytes, op) + + human_left = f"{num_str} {unit_str or 'B'}" + human_right = f"{thr_num:g} {thr_unit or (unit_str or 'B')}" + print( + f"[check] {label}: {human_left} {op} {human_right} -> {'OK' if ok else 'FAIL'}" + ) + + if ok: + return None + return f"{label}: {human_left} not {op} {human_right}" + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument("--log", required=True, help="Path to log file") + parser.add_argument( + "--require", + action="append", + nargs=2, + metavar=("LABEL", "COND"), + default=[], + help="""Required label and condition consisting + of a number and unit. 
Example: \"Total DRAM + used\" \"<= 0.06 KiB\"""", + ) + args = parser.parse_args() + + with open(args.log, "r", encoding="utf-8", errors="ignore") as f: + lines = f.readlines() + + failures: List[str] = [] + for label, cond in args.require: + msg = check_requirement(label, cond, lines) + if msg: + failures.append(msg) + + if failures: + print("Failures:") + for msg in failures: + print(" - " + msg) + return 1 + + print("All checks passed.") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py index 8bf72827549..0cba8d987c0 100644 --- a/backends/arm/test/tester/arm_tester.py +++ b/backends/arm/test/tester/arm_tester.py @@ -28,17 +28,11 @@ import torch.fx import torch.utils._pytree as pytree - from executorch.backends.arm._passes.arm_pass_manager import ArmPassManager from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec -from executorch.backends.arm.ethosu import EthosUCompileSpec, EthosUPartitioner -from executorch.backends.arm.quantizer import ( - EthosUQuantizer, - get_symmetric_quantization_config, - TOSAQuantizer, - VgfQuantizer, -) +from executorch.backends.arm.ethosu import EthosUCompileSpec +from executorch.backends.arm.quantizer import get_symmetric_quantization_config from executorch.backends.arm.test.runner_utils import ( dbg_tosa_fb_to_json, get_output_quantization_params, @@ -53,9 +47,13 @@ from executorch.backends.arm.tosa import TosaSpecification from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec from executorch.backends.arm.tosa.mapping import extract_tensor_meta -from executorch.backends.arm.tosa.partitioner import TOSAPartitioner -from executorch.backends.arm.vgf import VgfCompileSpec, VgfPartitioner +from executorch.backends.arm.util._factory import ( + create_partitioner, + create_quantizer, + parse_compile_spec, +) +from executorch.backends.arm.vgf import VgfCompileSpec from 
executorch.backends.test.harness.error_statistics import ErrorStatistics from executorch.backends.test.harness.stages import Stage, StageType @@ -83,7 +81,6 @@ _copy_module, _update_exported_program_graph_module, ) - from tabulate import tabulate from torch.export.graph_signature import ExportGraphSignature, InputSpec, OutputSpec @@ -103,12 +100,6 @@ def _dump_lowered_modules_artifact( artifact.exported_program().graph_signature ) - def get_output_format(lowered_module) -> str | None: - for spec in lowered_module.compile_specs: - if spec.key == "output_format": - return spec.value.decode() - return None - for node in graph_module.graph.nodes: if node.op == "get_attr" and node.name.startswith("lowered_module_"): lowered_module = getattr(graph_module, node.name) @@ -116,13 +107,13 @@ def get_output_format(lowered_module) -> str | None: lowered_module, LoweredBackendModule ), f"Attribute {node.name} must be of type LoweredBackendModule." - output_format = get_output_format(lowered_module) - if output_format == "tosa": + compile_spec = parse_compile_spec(lowered_module.compile_specs) + if isinstance(compile_spec, TosaCompileSpec): tosa_fb = lowered_module.processed_bytes to_print = dbg_tosa_fb_to_json(tosa_fb) to_print = pformat(to_print, compact=True, indent=1) output += f"\nTOSA deserialized {node.name}: \n{to_print}\n" - elif output_format == EthosUCompileSpec.get_output_format(): + elif isinstance(compile_spec, EthosUCompileSpec): vela_cmd_stream = lowered_module.processed_bytes output += f"\nVela command stream {node.name}: \n{vela_cmd_stream}\n" else: @@ -284,13 +275,7 @@ def quantize( quantize_stage: Optional[tester.Quantize] = None, ): if quantize_stage is None: - quantizer = None - if isinstance(self.compile_spec, TosaCompileSpec): - quantizer = TOSAQuantizer(self.compile_spec) - elif isinstance(self.compile_spec, EthosUCompileSpec): - quantizer = EthosUQuantizer(self.compile_spec) - elif isinstance(self.compile_spec, VgfCompileSpec): - quantizer = 
VgfQuantizer(self.compile_spec) + quantizer = create_quantizer(self.compile_spec) quantize_stage = tester.Quantize( quantizer, get_symmetric_quantization_config(), @@ -312,14 +297,7 @@ def to_edge( def partition(self, partition_stage: Optional[Partition] = None): if partition_stage is None: - if isinstance(self.compile_spec, TosaCompileSpec): - arm_partitioner = TOSAPartitioner(self.compile_spec) - elif isinstance(self.compile_spec, EthosUCompileSpec): - arm_partitioner = EthosUPartitioner(self.compile_spec) - elif isinstance(self.compile_spec, VgfCompileSpec): - arm_partitioner = VgfPartitioner(self.compile_spec) - else: - raise ValueError("compile spec doesn't target any Arm Partitioner") + arm_partitioner = create_partitioner(self.compile_spec) partition_stage = Partition(arm_partitioner) return super().partition(partition_stage) @@ -329,7 +307,7 @@ def to_edge_transform_and_lower( partitioners: Optional[List[Partitioner]] = None, edge_compile_config: Optional[EdgeCompileConfig] = None, additional_checks: Optional[ - List[Union[DontPartition | DontPartitionModule | DontPartitionName]] + List[DontPartition | DontPartitionModule | DontPartitionName] ] = None, transform_passes: Optional[ Union[Sequence[PassType], Dict[str, Sequence[PassType]]] @@ -343,20 +321,9 @@ def to_edge_transform_and_lower( if to_edge_and_lower_stage is None: if partitioners is None: - if isinstance(self.compile_spec, TosaCompileSpec): - arm_partitioner = TOSAPartitioner( - self.compile_spec, additional_checks - ) - elif isinstance(self.compile_spec, EthosUCompileSpec): - arm_partitioner = EthosUPartitioner( - self.compile_spec, additional_checks - ) - elif isinstance(self.compile_spec, VgfCompileSpec): - arm_partitioner = VgfPartitioner( - self.compile_spec, additional_checks - ) - else: - raise ValueError("compile spec doesn't target any Arm Partitioner") + arm_partitioner = create_partitioner( + self.compile_spec, additional_checks + ) partitioners = [arm_partitioner] 
to_edge_and_lower_stage = ToEdgeTransformAndLower( partitioners, @@ -463,6 +430,10 @@ def run_method_and_compare_outputs( for run_iteration in range(num_runs): reference_input = inputs if inputs else next(self.generate_random_inputs()) + # Avoid issues with inplace operators + test_input = copy.deepcopy(reference_input) + original_input = copy.deepcopy(reference_input) + input_shapes = [ generated_input.shape if hasattr(generated_input, "shape") else (1,) for generated_input in reference_input @@ -477,16 +448,16 @@ def run_method_and_compare_outputs( # Run exported module directly test_outputs, _ = pytree.tree_flatten( self._calculate_reference_output( - exported_program.module(), reference_input + exported_program.module(), test_input ) ) else: # Run lowered model with target test_outputs, _ = pytree.tree_flatten( - test_stage.run_artifact(reference_input) + test_stage.run_artifact(test_input) ) - logger.info(f"\n Input: {reference_input}") + logger.info(f"\n Input: {original_input}") logger.info(f"\n Ref output: {reference_outputs}") logger.info(f"\nTest output: {test_outputs}") @@ -743,22 +714,19 @@ def _get_tosa_operator_distribution( op_list = [] id = 0 while lowered_module := getattr(graph_module, f"lowered_module_{id}", None): - for spec in lowered_module.compile_specs: - if spec.key != "output_format": - continue - if spec.value == b"tosa": - tosa_fb = lowered_module.processed_bytes - tosa_json = dbg_tosa_fb_to_json(tosa_fb) - for region in tosa_json["regions"]: - for block in region["blocks"]: - op_list.extend( - [operator["op"] for operator in block["operators"]] - ) - break - elif spec.value == EthosUCompileSpec.get_output_format().encode(): - return "Can not get operator distribution for Vela command stream." - else: - return f"Unknown output format '{spec.value}'." 
+ compile_spec = parse_compile_spec(lowered_module.compile_specs) + if isinstance(compile_spec, TosaCompileSpec): + tosa_fb = lowered_module.processed_bytes + tosa_json = dbg_tosa_fb_to_json(tosa_fb) + for region in tosa_json["regions"]: + for block in region["blocks"]: + op_list.extend([operator["op"] for operator in block["operators"]]) + elif isinstance(compile_spec, EthosUCompileSpec): + return "Can not get operator distribution for Vela command stream." + elif isinstance(compile_spec, VgfCompileSpec): + return "Can not get operator distribution for VGF." + else: + return f"Unknown output format '{compile_spec.get_output_format()}'." id += 1 if id == 0: return "No delegate with name 'lowered_module_0 found in graph module." diff --git a/backends/arm/test/tester/test_pipeline.py b/backends/arm/test/tester/test_pipeline.py index 123c1af44c3..54a8f08ee50 100644 --- a/backends/arm/test/tester/test_pipeline.py +++ b/backends/arm/test/tester/test_pipeline.py @@ -906,7 +906,7 @@ class VgfPipeline(BasePipelineMaker, Generic[T]): exir_ops: Exir dialect ops expected to be found in the graph after to_edge. if not using use_edge_to_transform_and_lower. - run_on_vulkan_runtime: Set to true to test VGF output on VKML runtime. + run_on_vulkan_runtime: Whether to test VGF output on VKML runtime. vgf_compiler_flags: Optional compiler flags. 
@@ -922,7 +922,7 @@ def __init__( test_data: T, aten_op: str | List[str], exir_op: Optional[str | List[str]] = None, - run_on_vulkan_runtime: bool = False, + run_on_vulkan_runtime: bool = True, vgf_compiler_flags: Optional[str] = "", tosa_version: str = "TOSA-1.0+FP", symmetric_io_quantization: bool = False, @@ -1018,3 +1018,16 @@ def __init__( qtol=qtol, inputs=self.test_data, ) + self.run_on_vulkan_runtime = run_on_vulkan_runtime + + # TODO: Remove once CI fully working + def run(self): + import pytest + + if self.run_on_vulkan_runtime: + try: + super().run() + except FileNotFoundError as e: + pytest.skip(f"VKML executor_runner not found - not built - skip {e}") + else: + super().run() diff --git a/backends/arm/tosa/backend.py b/backends/arm/tosa/backend.py index afae6f8163f..7a7ea2ca377 100644 --- a/backends/arm/tosa/backend.py +++ b/backends/arm/tosa/backend.py @@ -104,10 +104,15 @@ def _preprocess( # noqa: C901 # const data directly. Path created and data written only in debug builds. tosa_graph = ts.TosaSerializer(artifact_path) - assert ( + if not ( tosa_spec.version.major == ts.TOSA_VERSION_MAJOR and tosa_spec.version.minor == ts.TOSA_VERSION_MINOR - ), f"TOSA serializer version ({ts.TOSA_VERSION_MAJOR}.{ts.TOSA_VERSION_MINOR}) doesn't match specification {tosa_spec}" + ): + raise RuntimeError( + f"TOSA serializer version " + f"({ts.TOSA_VERSION_MAJOR}.{ts.TOSA_VERSION_MINOR}) " + f"doesn't match specification {tosa_spec}" + ) # TODO: Fix the need to lazily import this. from executorch.backends.arm._passes import ArmPassManager @@ -201,8 +206,8 @@ def filter_tosa_compile_specs( hardware. 
""" - new_compile_spec = TosaCompileSpec.__new__(TosaCompileSpec) - new_compile_spec._set_compile_specs( - compile_spec.tosa_spec, [], compile_spec.get_intermediate_path() + return ( + TosaCompileSpec(compile_spec.tosa_spec) + .dump_intermediate_artifacts_to(compile_spec.get_intermediate_path()) + .dump_debug_info(compile_spec.tosa_debug_mode) ) - return new_compile_spec diff --git a/backends/arm/tosa/dialect/__init__.py b/backends/arm/tosa/dialect/__init__.py index 136f59beb62..897de70279f 100644 --- a/backends/arm/tosa/dialect/__init__.py +++ b/backends/arm/tosa/dialect/__init__.py @@ -4,7 +4,9 @@ # LICENSE file in the root directory of this source tree. from executorch.backends.arm.tosa.dialect.ops import ( # noqa F401 + matmul, rescale, + resize, table, transpose, ) diff --git a/backends/arm/tosa/dialect/ops/matmul.py b/backends/arm/tosa/dialect/ops/matmul.py new file mode 100644 index 00000000000..1ba3821f674 --- /dev/null +++ b/backends/arm/tosa/dialect/ops/matmul.py @@ -0,0 +1,56 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from executorch.backends.arm.tosa.dialect.lib import TosaValueError +from executorch.backends.arm.tosa.dialect.ops_registration import register_fake_tosa_op + +from executorch.backends.arm.tosa.specification import ( + get_context_spec, + TosaSpecification, +) +from executorch.exir.dialects._ops import ops as exir_ops + + +@register_fake_tosa_op( + "MATMUL(Tensor input1, Tensor input2) -> Tensor", # schema + ( + TosaSpecification.create_from_string("TOSA-1.0+INT"), + ), # target TOSA specifications +) +def MATMUL(x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: + tosa_spec = get_context_spec() + """Performs matrix multiplication on two input tensors. + Additionally validates TOSA constraints of a MATMUL op. 
+ """ + if x1.dtype != x2.dtype: + raise TosaValueError( + f"Input tensors must have the same dtype, got {x1.dtype} and {x2.dtype}", + op="MATMUL", + ) + if x1.dtype in (torch.int8, torch.int16): + if not tosa_spec.support_integer(): + raise TosaValueError( + f"TOSA spec {tosa_spec} doesn't support integers", op="MATMUL" + ) + else: + dtype = torch.int32 + elif x1.dtype in (torch.float16, torch.float32): + if not tosa_spec.support_float(): + raise TosaValueError( + f"TOSA spec {tosa_spec} doesn't support float", op="MATMUL" + ) + else: + # float16 supports float16 accumulation as well + dtype = torch.float32 + else: + raise TosaValueError( + f"Input tensors must be of type int8, float16 or float32, got {x1.dtype}", + op="MATMUL", + ) + + aten_fake_tensor = exir_ops.edge.aten.bmm.default(x1, x2) + + return torch.empty_like(aten_fake_tensor, dtype=dtype) diff --git a/backends/arm/tosa/dialect/ops/resize.py b/backends/arm/tosa/dialect/ops/resize.py new file mode 100644 index 00000000000..1f976d0f5e0 --- /dev/null +++ b/backends/arm/tosa/dialect/ops/resize.py @@ -0,0 +1,60 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Literal, Optional + +import torch +from executorch.backends.arm.tosa.dialect.lib import TosaValueError +from executorch.backends.arm.tosa.dialect.ops_registration import register_fake_tosa_op + +from executorch.backends.arm.tosa.specification import ( + get_context_spec, + TosaSpecification, +) +from executorch.exir.dialects._ops import ops as exir_ops + + +# Add kwarg instead? +@register_fake_tosa_op( + "RESIZE(Tensor input, SymInt[]? output_size, bool align_corners, float[]? 
scale_factors, *, str resize_mode) -> Tensor", # schema + ( + TosaSpecification.create_from_string("TOSA-1.0+INT"), + TosaSpecification.create_from_string("TOSA-1.0+FP"), + ), # target TOSA specifications +) +def RESIZE( + x: torch.Tensor, + output_size: list[int] | None = None, + align_corners: Optional[bool] = False, + scale_factors: list[float] | None = None, + *, + resize_mode: Literal["nearest", "bilinear"], +) -> torch.Tensor: + tosa_spec = get_context_spec() + + if resize_mode not in ("nearest", "bilinear"): + raise TosaValueError(f"Unsupported resize mode {resize_mode} for TOSA RESIZE") + if x.dtype == torch.int8: + if not tosa_spec.support_integer(): + raise TosaValueError( + f"TOSA spec {tosa_spec} doesn't support integers", op="RESIZE" + ) + bilinear = resize_mode == "bilinear" + output_dtype = torch.int32 if bilinear else torch.int8 + elif x.dtype in (torch.float16, torch.float32): + if not tosa_spec.support_float(): + raise TosaValueError( + f"TOSA spec {tosa_spec} doesn't support float", op="RESIZE" + ) + output_dtype = x.dtype + else: + raise TosaValueError(f"Unsupported input dtype {x.dtype} for TOSA RESIZE") + + # Does it matter which one to use for fake tracing? + fake_aten_tensor = exir_ops.edge.aten.upsample_nearest2d.vec( + x, output_size, scale_factors + ) + + return fake_aten_tensor.to(output_dtype) diff --git a/backends/arm/tosa/dialect/ops/transpose.py b/backends/arm/tosa/dialect/ops/transpose.py index 9c5aba05394..8d5bf8bac70 100644 --- a/backends/arm/tosa/dialect/ops/transpose.py +++ b/backends/arm/tosa/dialect/ops/transpose.py @@ -26,9 +26,9 @@ def TRANSPOSE(a, perms): # By utilizing an edge IR passthrough operator we can keep the edge program in # channels-first/contiguous and get the desired behavior in the TOSA lowering. 
- if len(perms) not in (4, 5): + if len(perms) not in (4, 5, 6): raise TosaValueError( - f"Only 4D and 5D tensors are supported, got {len(perms)}: {perms}", + f"Only 4D, 5D and 6D tensors are supported, got {len(perms)}: {perms}", op="TRANSPOSE", ) diff --git a/backends/arm/tosa/mapping.py b/backends/arm/tosa/mapping.py index 60ef98a37c0..64e4ae96e08 100644 --- a/backends/arm/tosa/mapping.py +++ b/backends/arm/tosa/mapping.py @@ -4,13 +4,14 @@ # LICENSE file in the root directory of this source tree. # pyre-unsafe +"""Provide PyTorch-to-TOSA mapping helpers. -# -# PyTorch to Tosa mapping - simple mapping functions and multi-type extraction -# of key information. These are used by the initial compile stage which captures -# the standardised TOSA representation. -# +Use these utilities to translate PyTorch dtypes and FX node metadata into +the TOSA serializer types and shapes used during initial compilation. + +""" +from enum import Enum from typing import Any, Optional, Sequence import serializer.tosa_serializer as ts # type: ignore @@ -31,7 +32,36 @@ ) +class TosaSpecialDtype(Enum): + """ + Special TOSA data types that are not natively supported in PyTorch, to be + used in specific scenarios as a value in the key from meta_key(). + """ + + INT48 = ts.DType.INT48 + + def get_tosa_dtype(self) -> ts.TosaDType.DType: + return self.value + + @staticmethod + def meta_key() -> str: + return "tosa_special_dtype" + + def map_dtype(data_type: torch.dtype, tosa_spec: TosaSpecification) -> Any: + """Map a ``torch.dtype`` to a ``ts.DType``. + + Args: + data_type (torch.dtype): PyTorch dtype to convert. + tosa_spec (TosaSpecification): Active spec (reserved for future checks). + + Returns: + Any: Matching ``ts.DType`` enum value. + + Raises: + ValueError: If the dtype is unsupported or unknown. 
+ + """ if data_type in UNSUPPORTED_DTYPES: raise ValueError(f"Unsupported type: {data_type}") @@ -57,7 +87,22 @@ def map_dtype(data_type: torch.dtype, tosa_spec: TosaSpecification) -> Any: # TODO: other types, can be # SymInt, FakeTensor, a List[Union[FakeTensor, SymInt]], or None def extract_tensor_meta(meta, tosa_spec: TosaSpecification): - assert meta.get("val") is not None + """Extract dtype, shape, and dimension order from FX metadata. + + Args: + meta (dict): FX node ``meta`` containing a ``val`` FakeTensor (or tuple). + tosa_spec (TosaSpecification): Active TOSA spec for dtype mapping. + + Returns: + tuple: ``(dtype, shape, dim_order)`` where ``dtype`` is ``ts.DType``, + ``shape`` is ``Tuple[int, ...]``, and ``dim_order`` is ``Tuple[int, ...]``. + + Raises: + ValueError: If ``meta['val']`` is not a ``FakeTensor``. + + """ + if meta.get("val") is None: + raise ValueError("Expected node.meta['val'] to be set to a FakeTensor") val = meta["val"] if type(val) is tuple: # TODO: should use first concrete representation @@ -77,23 +122,72 @@ def extract_tensor_meta(meta, tosa_spec: TosaSpecification): return (dtype, shape, dim_order) -# Class to capture arguments and turn into tensor references for TOSA OPs class TosaArg: + """Capture and normalize TOSA operator arguments. + + Use this to convert FX nodes, sequences, and numeric literals into a + consistent structure suitable for TOSA serialization. + + Attributes: + name (str): Node name when argument is a ``torch.fx.Node``; empty otherwise. + dtype (ts.DType | None): Inferred dtype when available. + shape (tuple[int, ...] | None): Inferred shape when available. + dim_order (tuple[int, ...] | None): Dimension order, defaulting to ``range(len(shape))``. + special (list | None): Captured list when the argument is a sequence. + number (float | int | None): Captured numeric value when given. + tosa_spec (TosaSpecification): Active specification used for mapping. 
+ + """ + def __process_node(self, argument: torch.fx.Node): + """Parse a ``torch.fx.Node`` and populate tensor attributes. + + Args: + argument (torch.fx.Node): FX node to inspect. + + """ self.name: str = argument.name - self.dtype, self.shape, self.dim_order = extract_tensor_meta( + output_dtype, self.shape, self.dim_order = extract_tensor_meta( argument.meta, self.tosa_spec ) + # Handle special case of types not representable in torch (i.e. i48_t) + if special_type := argument.meta.get(TosaSpecialDtype.meta_key(), None): + output_dtype = special_type.get_tosa_dtype() + + self.dtype = output_dtype + def __process_list(self, argument): + """Capture a sequence argument as ``special``. + + Args: + argument (Sequence): Sequence to store. + + """ self.special: list = list(argument) def __process_number(self, argument: float | int): + """Capture a numeric argument as ``number``. + + Args: + argument (float | int): Numeric value. + + """ self.number: float | int = argument def __init__( self, argument: Any, tosa_spec: Optional[TosaSpecification] = None ) -> None: + """Initialize the argument wrapper and populate fields. + + Args: + argument (Any): One of ``torch.fx.Node``, ``Sequence``, ``int``, ``float``, ``torch.dtype``, or ``None``. + tosa_spec (Optional[TosaSpecification]): Active specification; required. + + Raises: + RuntimeError: If ``argument`` is of an unsupported type. + + """ if tosa_spec is None: raise ValueError("tosa_spec is None") elif not isinstance(tosa_spec, TosaSpecification): @@ -127,6 +221,12 @@ def __init__( ) def __repr__(self): + """Return a compact representation of populated attributes. + + Returns: + str: Readable list of set attributes. 
+ + """ attrs = [] if hasattr(self, "name"): if self.name is not None: diff --git a/backends/arm/tosa/partitioner.py b/backends/arm/tosa/partitioner.py index 3e512847109..6eb1dcbef72 100644 --- a/backends/arm/tosa/partitioner.py +++ b/backends/arm/tosa/partitioner.py @@ -4,6 +4,15 @@ # LICENSE file in the root directory of this source tree. # pyre-unsafe +"""Provide a partitioner for delegating subgraphs to the TOSA backend. + +Implement logic to identify and tag regions of an ``ExportedProgram`` that can +be delegated to the TOSA backend. Use this module to: + +- Partition graphs based on operator support and additional checks. +- Prune trivial no-op partitions that would lower to empty TOSA graphs. +- Tag constant data and report reasons for rejected nodes. +""" import logging from typing import Callable, List, Optional, Sequence, Tuple @@ -34,14 +43,46 @@ def is_noop_clone(node: torch.fx.node.Node) -> bool: + """Return True if the node is a no-op ``dim_order_ops._clone_dim_order``. + + Args: + node (torch.fx.Node): FX node to inspect. + + Returns: + bool: True if the node targets ``dim_order_ops._clone_dim_order.default`` + in the Edge dialect; otherwise, False. + + """ return node.target == exir_ops.edge.dim_order_ops._clone_dim_order.default def is_noop_alias_copy(node: torch.fx.Node) -> bool: + """Return True if the node is a no-op ``aten.alias_copy``. + + Args: + node (torch.fx.Node): FX node to inspect. + + Returns: + bool: True if the node targets ``aten.alias_copy.default``; otherwise, + False. + + """ return node.target == exir_ops.edge.aten.alias_copy.default def is_noop_to_dim_order_copy(node: torch.fx.node.Node) -> bool: + """Return True if node is a no-op ``dim_order_ops._to_dim_order_copy``. + + Consider the op a no-op when the output dtype equals the input's dtype. + + Args: + node (torch.fx.Node): FX node to inspect. + + Returns: + bool: True if it targets ``_to_dim_order_copy.default`` and preserves + dtype; otherwise, False. 
+ + """ if node.target != exir_ops.edge.dim_order_ops._to_dim_order_copy.default: return False else: @@ -49,6 +90,19 @@ def is_noop_to_dim_order_copy(node: torch.fx.node.Node) -> bool: def is_noop_expand(node: torch.fx.node.Node) -> bool: + """Return True if the node is an ``expand_copy`` with all-ones multiples. + + This corresponds to a semantic no-op, since expanding by 1 along every + dimension leaves the tensor unchanged. + + Args: + node (torch.fx.Node): FX node to inspect. + + Returns: + bool: True if the node targets ``aten.expand_copy.default`` and all + computed multiples are 1; otherwise, False. + + """ if node.target != exir_ops.edge.aten.expand_copy.default: return False else: @@ -57,11 +111,30 @@ def is_noop_expand(node: torch.fx.node.Node) -> bool: class TOSAPartitioner(Partitioner): + """Partition an exported program into TOSA-delegable subgraphs. + + Construct this partitioner for compile specs targeting TOSA. The partition + algorithm uses capability checks and optional additional operator-support + rules to tag nodes with a delegation tag per subgraph. + """ + def __init__( self, compile_spec: TosaCompileSpec, additional_checks: Optional[Sequence[OperatorSupportBase]] = None, ) -> None: + """Initialize the TOSAPartitioner. + + Args: + compile_spec (TosaCompileSpec): Parsed compile specifications for + TOSA containing the TOSA spec and original list. + additional_checks (Optional[Sequence[OperatorSupportBase]]): Extra + operator-support checks to apply when partitioning. + + Raises: + RuntimeError: If the provided compile spec does not target TOSA. 
+ + """ self.delegation_spec = DelegationSpec( TOSABackend.__name__, compile_spec.to_list() ) @@ -70,9 +143,22 @@ def __init__( self.tosa_spec = compile_spec.tosa_spec def partition(self, exported_program: ExportedProgram) -> PartitionResult: # noqa - # Run the CapabilityBasedPartitioner to return the largest possible - # subgraphs containing the nodes with the tags + """Partition the program and tag TOSA-compatible subgraphs. + + Run the FX capability-based partitioner to propose subgraphs, then + refine tags by removing boundary-only quantize/dequantize nodes and by + rejecting partitions that would lower to no-ops. Emit a detailed report + of rejected nodes and their reasons. + + Args: + exported_program (ExportedProgram): Program to analyze and + partition. + + Returns: + PartitionResult: The input program with nodes tagged for delegation + and a mapping of partition tags to delegation specs. + """ logger.info("TOSAPartitioner::partition") partition_tags: dict[str, DelegationSpec] = {} @@ -92,6 +178,15 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: # no partition_list = capability_partitioner.propose_partitions() def reject_partition(reason: str, partition, tag) -> None: + """Remove a proposed partition and record the rejection reason. + + Args: + reason (str): Human-readable explanation for rejection. + partition (object): Proposed partition object from the + capability partitioner. + tag (str): Delegation tag associated with the partition. + + """ for node in partition.nodes: if "delegation_tag" in node.meta: del node.meta["delegation_tag"] @@ -105,6 +200,16 @@ def reject_partition(reason: str, partition, tag) -> None: tag = f"tag{partition.id}" def is_partitioned(node: torch.fx.Node, tag=tag) -> bool: + """Return True if the node currently belongs to the partition ``tag``. + + Args: + node (torch.fx.Node): FX node to check. + tag (str): Delegation tag identifying the partition. 
+ + Returns: + bool: True if the node carries the matching delegation tag. + + """ return ( "delegation_tag" in node.meta and node.meta["delegation_tag"] == tag ) @@ -113,8 +218,8 @@ def is_partitioned(node: torch.fx.Node, tag=tag) -> bool: node.meta["delegation_tag"] = tag partition_tags[tag] = self.delegation_spec - # De-tag outmost q-nodes upwards and dq-nodes downwards. - # De-tag if at least one input/ output is not part of partition. + # De-tag outermost q-nodes upwards and dq-nodes downwards. + # De-tag if at least one input/output is not part of the partition. for node in exported_program.graph_module.graph.nodes: if not is_partitioned(node): continue @@ -175,15 +280,41 @@ def ops_to_not_decompose( self, ep: ExportedProgram, ) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]: + """Return operators and a filter that should not be decomposed. + + Provide a base set of ops to preserve as-is and a predicate that keeps + certain activations whole when surrounded by quantize/dequantize ops in + a quantized graph. This helps downstream TOSA lowering and delegation. + + Args: + ep (ExportedProgram): Program used to infer target-specific policy. + + Returns: + Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]: + A list of op overloads to keep intact, and an optional filter + function that returns True when an op should not be decomposed. + + """ ops_to_not_decompose_if_quant_op = [ torch.ops.aten.hardsigmoid.default, torch.ops.aten.hardswish.default, ] def filter_fn(node: torch.fx.Node) -> bool: - # This function filters for operators to not decompose where: - # - It's target is in ops_to_not_decompose_if_quant_op list. - # - All it's inputs/outputs are quantize operators. + """Return True to keep selected ops intact inside quantized regions. 
+ + The predicate holds when the target is in + ``ops_to_not_decompose_if_quant_op`` and all inputs/outputs are + quantize/dequantize ops, indicating a quantized activation that + should not be decomposed. + + Args: + node (torch.fx.Node): FX node to evaluate. + + Returns: + bool: True to keep the op intact; otherwise, False. + + """ dq = torch.ops.quantized_decomposed.dequantize_per_tensor.default q = torch.ops.quantized_decomposed.quantize_per_tensor.default @@ -204,7 +335,7 @@ def filter_fn(node: torch.fx.Node) -> bool: return should_not_decompose - # Be default, do not decompose the operator + # By default, do not decompose the operator return True ops_to_not_decompose = [ diff --git a/backends/arm/tosa/quant_utils.py b/backends/arm/tosa/quant_utils.py index 86e8e5bad8b..562c77e30da 100644 --- a/backends/arm/tosa/quant_utils.py +++ b/backends/arm/tosa/quant_utils.py @@ -20,6 +20,7 @@ from executorch.backends.arm.tosa.mapping import TosaArg from torch.fx import Node + from tosa.RoundingMode import RoundingMode # type: ignore @@ -76,6 +77,59 @@ def insert_rescale_ops_to_int32_maxscale( return [rescaled_lhs, rescaled_rhs], back_scale +def insert_rescale_ops_int16_to_int32_maxscale( + tosa_graph: Any, inputs: list[TosaArg], node: Node, tosa_spec=None +) -> tuple[list[Any], float]: + """For ADD and SUB with int16 inputs, we rescale to int32 using a different common scale(2*max(left scale,right scale)) + compared to all the other cases. We multiply the left and right scales by 1<<12 giving us extra precision + for the computation without overflowing. + + Returns a list of the rescaled nodes and the scale factor used, + needed by insert_rescale_op_to_int16. 
+ """ + + if len(inputs) > 2: + raise ValueError("More than two inputs not supported") + + tensors = inputs.copy() + # Reshape tensor according to TOSA dim order + for tensor in tensors: + dim_order = tensor.dim_order + tensor.shape = [tensor.shape[i] for i in dim_order] + + input_qparams = get_input_qparams(node) + lhs_qparams, rhs_qparams = input_qparams.values() + lhs_scale = lhs_qparams.get_scale_per_tensor() + rhs_scale = rhs_qparams.get_scale_per_tensor() + # Common scale for the two numbers + max_scale_2x = 2 * max(lhs_scale, rhs_scale) + SHIFT_INT16 = 12 + # We are adding two int16 numbers. If the zero point is non-null, the result will be in the range [-131070;131070], therefore we need 18 bits for the result. + # We have a 32-bit accumulator, so we can shift to the left by 12 bits and not overflow. In reality, because we divide by the 2*max(lhs_scale,rhs_scale) + # we are shifting to the left by 11. + lhs_factor = (1 << SHIFT_INT16) * lhs_scale / max_scale_2x + rhs_factor = (1 << SHIFT_INT16) * rhs_scale / max_scale_2x + rescaled_lhs = build_rescale_to_int32( + tosa_graph, + tensors[0], + lhs_qparams.get_zp_per_tensor(), + lhs_factor, + tosa_spec=tosa_spec, + ) + rescaled_rhs = build_rescale_to_int32( + tosa_graph, + tensors[1], + rhs_qparams.get_zp_per_tensor(), + rhs_factor, + tosa_spec=tosa_spec, + ) + out_qparam = get_output_qparams(node)[0] + out_scale = out_qparam.get_scale_per_tensor() + back_scale = max_scale_2x / (out_scale * (1 << SHIFT_INT16)) + + return [rescaled_lhs, rescaled_rhs], back_scale + + def insert_rescale_ops_to_int32( tosa_graph: Any, inputs: list[TosaArg], @@ -245,7 +299,9 @@ def compute_multiplier_and_shift( const_2_power_15_or_31 = 1 << offset shifted_mantissa = round(mantissa * const_2_power_15_or_31) - assert shifted_mantissa <= const_2_power_15_or_31 + assert ( + shifted_mantissa <= const_2_power_15_or_31 + ), f"Mantissa {shifted_mantissa} exceeds limit {const_2_power_15_or_31}" if shifted_mantissa == const_2_power_15_or_31: 
shifted_mantissa = shifted_mantissa // 2 @@ -255,13 +311,19 @@ def compute_multiplier_and_shift( shift = offset - shift # INT32_MAX, 2^31 - 1 - assert shifted_mantissa <= (const_2_power_15_or_31 - 1) + assert shifted_mantissa <= (const_2_power_15_or_31 - 1), ( + f"Mantissa {shifted_mantissa} exceeds signed max " + f"{const_2_power_15_or_31 - 1}" + ) multiplier = shifted_mantissa if shift > 62: multiplier = multiplier >> min(31, shift - 62) shift = 62 + + assert multiplier >= 0, "Multiplier should be non-negative" + assert shift >= 2 and shift <= 62, "Shift should be in range [2, 62]" multipliers.append(multiplier) shifts.append(shift) return multipliers, shifts @@ -313,10 +375,11 @@ def build_rescale( per_channel=False, ): import serializer.tosa_serializer as ts # type: ignore + import tosa.Op as TosaOp # type: ignore - scaleWidth = 32 - is_scale32 = True + scaleWidth = 16 if input_node.dtype == ts.DType.INT48 else 32 + is_scale32 = False if input_node.dtype == ts.DType.INT48 else True multipliers, shifts = compute_multiplier_and_shift(scale, scaleWidth) rescale_inputs = create_const_ops_for_rescale( tosa_fb, diff --git a/backends/arm/tosa/specification.py b/backends/arm/tosa/specification.py index b372cd5a636..3edf27760b5 100644 --- a/backends/arm/tosa/specification.py +++ b/backends/arm/tosa/specification.py @@ -4,12 +4,12 @@ # LICENSE file in the root directory of this source tree. # pyre-unsafe +"""Provide TOSA specification parsing and context utilities. -# -# Main implementation of AoT flow to partition and preprocess for Arm target -# backends. Converts via TOSA as an intermediate form supported by AoT and -# JIT compiler flows. -# +Use these helpers to parse and validate TOSA profile/extension strings and to +manage a lowering-time context for the active specification. 
+ +""" import contextvars import re @@ -19,36 +19,39 @@ class TosaSpecification: - """ - This class implements a representation of TOSA specification - (https://www.mlplatform.org/tosa/tosa_spec.html) with a version, a profile - (with extension) and a level (8k). - For 1.00 releases the profile is INT or FP, and the extensions are for - INT: int16, int4, var, cf - FP: bf16, fp8e4m3, fp8e5m2, fft, var, cf + """Represent a TOSA specification. - The TOSA specification is encoded in the string represenatation - TOSA-major.minor.patch+profile[+level][+extensions] + A specification includes a semantic version, one or more profiles, and + optional extensions and levels (for example ``8k``). + The encoded form follows ``TOSA-..+[+][+...]``. + Profiles use uppercase (for example ``INT``, ``FP``); levels and extensions + use lowercase. + + Attributes: + version (Version): Parsed TOSA semantic version. + is_U55_subset (bool): True if the ``u55`` subset is requested. - Profiles are uppercase letters and extensions and level is lowercase. """ version: Version is_U55_subset: bool def support_integer(self) -> bool: - """ - Returns true if any integer operations are supported for the specification. - """ + """Return True if integer operations are supported.""" raise NotImplementedError def support_float(self) -> bool: - """ - Returns true if any float operations are supported for the specification. - """ + """Return True if floating-point operations are supported.""" raise NotImplementedError def __init__(self, version: Version, extras: List[str]): + """Initialize the base specification. + + Args: + version (Version): Parsed TOSA semantic version. + extras (List[str]): Remaining tokens such as profiles, levels, and extensions. 
+ + """ self.version = version self.is_U55_subset = "u55" in extras @@ -57,11 +60,20 @@ def __init__(self, version: Version, extras: List[str]): @staticmethod def create_from_string(repr: str) -> "TosaSpecification": - """ - Creates a TOSA specification class from a string representation: - TOSA-1.00.0+INT+FP+int4+cf - """ + """Create a specification from a standard string format. + + Example: ``TOSA-1.00.0+INT+FP+int4+cf``. + Args: + repr (str): Standard representation string. + + Returns: + TosaSpecification: Parsed specification instance. + + Raises: + ValueError: If the representation is malformed or version is unsupported. + + """ pattern = r"^(TOSA)-([\d.]+)\+(.+)$" match = re.match(pattern, repr) if match: @@ -80,6 +92,18 @@ def create_from_string(repr: str) -> "TosaSpecification": class Tosa_1_00(TosaSpecification): + """Provide TOSA 1.00 profile and extension semantics. + + This variant validates profiles (``INT``, ``FP``), the optional ``8k`` level, + and allowed extensions based on the selected profiles. + + Attributes: + profiles (List[str]): Selected profiles, e.g., ``["INT"]`` or ``["INT", "FP"]``. + level_8k (bool): True if the ``8k`` level is enabled. + extensions (List[str]): Enabled extensions valid for the chosen profiles. + + """ + profiles: List[str] level_8k: bool extensions: List[str] @@ -91,6 +115,16 @@ class Tosa_1_00(TosaSpecification): } def __init__(self, version: Version, extras: List[str]): + """Initialize the 1.00 specification and validate extras. + + Args: + version (Version): Semantic version (major=1, minor=0). + extras (List[str]): Tokens including profiles, level, and extensions. + + Raises: + ValueError: If no/too many profiles are provided or extensions are invalid. 
+ + """ super().__init__(version, extras) # Check that we have at least one profile in the extensions list @@ -129,12 +163,20 @@ def __init__(self, version: Version, extras: List[str]): self.extensions = extras def _get_profiles_string(self) -> str: + """Return the ``+``-joined profile segment (e.g., ``+INT+FP``).""" return "".join(["+" + p for p in self.profiles]) def _get_extensions_string(self) -> str: + """Return the ``+``-joined extensions segment (e.g., ``+int4+cf``).""" return "".join(["+" + e for e in self.extensions]) def __repr__(self): + """Return the standard specification string format. + + Returns: + str: Standard form like ``TOSA-1.00.0+INT+8k+int4``. + + """ extensions = self._get_extensions_string() if self.level_8k: extensions += "+8k" @@ -143,9 +185,24 @@ def __repr__(self): return f"TOSA-{self.version}{self._get_profiles_string()}{extensions}" def __hash__(self) -> int: + """Return a stable hash for use in sets and dict keys. + + Returns: + int: Hash value derived from version and profiles. + + """ return hash(str(self.version) + self._get_profiles_string()) def __eq__(self, other: object) -> bool: + """Return True if another instance represents the same spec. + + Args: + other (object): Object to compare. + + Returns: + bool: True if versions and profiles match. + + """ if isinstance(other, Tosa_1_00): return (self.version == other.version) and ( self._get_profiles_string() == other._get_profiles_string() @@ -153,12 +210,23 @@ def __eq__(self, other: object) -> bool: return False def support_integer(self): + """Return True if the ``INT`` profile is present.""" return "INT" in self.profiles def support_float(self): + """Return True if the ``FP`` profile is present.""" return "FP" in self.profiles def support_extension(self, extension: str) -> bool: + """Return True if an extension is supported and enabled. + + Args: + extension (str): Extension name (for example ``int4``, ``bf16``). 
+ + Returns: + bool: True if the extension is valid for the active profiles and selected. + + """ for p in self.profiles: if extension in self.valid_extensions[p] and extension in self.extensions: return True @@ -167,30 +235,63 @@ def support_extension(self, extension: str) -> bool: class TosaLoweringContext: - """ - A context manager to handle the TOSA specific aspects of the lowering process. - For now it only handles the TOSA specification context, but it can be extended - to include other policies or configurations. + """Manage the TOSA specification context for lowering. + + For now, only the active ``TosaSpecification`` is tracked, but this can be + extended to carry additional lowering policies or configuration. + + Attributes: + tosa_spec_var (contextvars.ContextVar): Context variable storing the active spec. + spec (TosaSpecification): Specification passed to the context manager. + """ # Define a context variable for the spec tosa_spec_var: contextvars.ContextVar = contextvars.ContextVar("tosa_spec") def __init__(self, spec: TosaSpecification): + """Initialize the lowering context with a specification. + + Args: + spec (TosaSpecification): Active specification to put into context. + + """ self.spec = spec def __enter__(self): + """Set the context variable and return self. + + Returns: + TosaLoweringContext: This context manager instance. + + """ # Set the spec in the context variable and store the token for later reset self.token = TosaLoweringContext.tosa_spec_var.set(self.spec) return self def __exit__(self, exc_type, exc_value, traceback): + """Reset the context variable to its previous state. + + Args: + exc_type (type | None): Exception type, if any. + exc_value (BaseException | None): Exception instance, if any. + traceback (TracebackType | None): Traceback, if any. 
+ + """ # Reset the context variable to its previous state TosaLoweringContext.tosa_spec_var.reset(self.token) -# A helper function to retrieve the current spec anywhere in your code def get_context_spec() -> TosaSpecification: + """Get the current ``TosaSpecification`` from the lowering context. + + Returns: + TosaSpecification: Active specification retrieved from the context var. + + Raises: + RuntimeError: If called outside a ``TosaLoweringContext``. + + """ try: return TosaLoweringContext.tosa_spec_var.get() except LookupError: diff --git a/backends/arm/util/_factory.py b/backends/arm/util/_factory.py new file mode 100644 index 00000000000..23d8215fc9b --- /dev/null +++ b/backends/arm/util/_factory.py @@ -0,0 +1,59 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec +from executorch.backends.arm.ethosu import EthosUCompileSpec, EthosUPartitioner +from executorch.backends.arm.quantizer import ( + EthosUQuantizer, + TOSAQuantizer, + VgfQuantizer, +) +from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec +from executorch.backends.arm.tosa.partitioner import TOSAPartitioner +from executorch.backends.arm.vgf import VgfCompileSpec, VgfPartitioner +from executorch.exir.backend.compile_spec_schema import CompileSpec +from torch.fx.passes.operator_support import OperatorSupportBase + + +def parse_compile_spec(compile_specs: list[CompileSpec]) -> ArmCompileSpec: + output_format = None + for spec in compile_specs: + if spec.key == "output_format": + output_format = spec.value.decode() + break + else: + raise ValueError("Compile spec without output format.") + if output_format == TosaCompileSpec.get_output_format(): + return TosaCompileSpec.from_list(compile_specs) + if output_format == EthosUCompileSpec.get_output_format(): + return 
EthosUCompileSpec.from_list(compile_specs) + if output_format == VgfCompileSpec.get_output_format(): + return VgfCompileSpec.from_list(compile_specs) + raise ValueError(f"Unknown output format {output_format}") + + +def create_partitioner( + compile_spec: ArmCompileSpec, + additional_checks: list[OperatorSupportBase] | None = None, +): + if isinstance(compile_spec, TosaCompileSpec): + return TOSAPartitioner(compile_spec, additional_checks) + elif isinstance(compile_spec, EthosUCompileSpec): + return EthosUPartitioner(compile_spec, additional_checks) + elif isinstance(compile_spec, VgfCompileSpec): + return VgfPartitioner(compile_spec, additional_checks) + else: + raise ValueError("compile spec doesn't target any Arm Partitioner") + + +def create_quantizer(compile_spec: ArmCompileSpec): + if isinstance(compile_spec, TosaCompileSpec): + return TOSAQuantizer(compile_spec) + elif isinstance(compile_spec, EthosUCompileSpec): + return EthosUQuantizer(compile_spec) + elif isinstance(compile_spec, VgfCompileSpec): + return VgfQuantizer(compile_spec) + else: + raise ValueError("compile spec doesn't target any Arm Quantizer") diff --git a/backends/arm/util/arm_model_evaluator.py b/backends/arm/util/arm_model_evaluator.py index a3dcbdc5c6f..8c36128cea8 100644 --- a/backends/arm/util/arm_model_evaluator.py +++ b/backends/arm/util/arm_model_evaluator.py @@ -1,11 +1,11 @@ # Copyright 2024-2025 Arm Limited and/or its affiliates. -# All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
# pyre-unsafe +import json import logging import os import random @@ -14,7 +14,7 @@ from collections import defaultdict from pathlib import Path -from typing import Any, Optional, Tuple +from typing import Any, cast, Optional, Tuple import torch from torch.nn.modules import Module @@ -29,7 +29,139 @@ logger.setLevel(logging.INFO) +# ImageNet 224x224 transforms (Resize->CenterCrop->ToTensor->Normalize) +# If future models require different preprocessing, extend this helper accordingly. +def _get_imagenet_224_transforms(): + """Return standard ImageNet 224x224 preprocessing transforms.""" + return transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.484, 0.454, 0.403], std=[0.225, 0.220, 0.220]), + ] + ) + + +def _build_calibration_loader( + dataset: datasets.ImageFolder, max_items: int +) -> DataLoader: + """Return a DataLoader over a deterministic, shuffled subset of size <= max_items. + + Shuffles with seed: ARM_EVAL_CALIB_SEED (int) or default 1337; then selects first k and + sorts indices to keep enumeration order stable while content depends on seed. + """ + k = min(max_items, len(dataset)) + seed_env = os.getenv("ARM_EVAL_CALIB_SEED") + default_seed = 1337 + if seed_env is not None: + try: + seed = int(seed_env) + except ValueError: + logger.warning( + "ARM_EVAL_CALIB_SEED is not an int (%s); using default seed %d", + seed_env, + default_seed, + ) + seed = default_seed + else: + seed = default_seed + rng = random.Random(seed) + indices = list(range(len(dataset))) + rng.shuffle(indices) + selected = sorted(indices[:k]) + return torch.utils.data.DataLoader( + torch.utils.data.Subset(dataset, selected), batch_size=1, shuffle=False + ) + + +def _load_imagenet_folder(directory: str) -> datasets.ImageFolder: + """Shared helper to load an ImageNet-layout folder. + + Raises FileNotFoundError for a missing directory early to aid debugging. 
+ """ + directory_path = Path(directory) + if not directory_path.exists(): + raise FileNotFoundError(f"Directory: {directory} does not exist.") + transform = _get_imagenet_224_transforms() + return datasets.ImageFolder(directory_path, transform=transform) + + class GenericModelEvaluator: + """Base evaluator computing quantization error metrics and optional compression ratio. + + Subclasses can extend: provide calibration (get_calibrator) and override evaluate() + to add domain specific metrics (e.g. top-1 / top-5 accuracy). + """ + + @staticmethod + def evaluate_topk( + model: Module, + dataset: datasets.ImageFolder, + batch_size: int, + topk: int = 5, + log_every: int = 50, + ) -> Tuple[float, float]: + """Evaluate model top-1 / top-k accuracy. + + Args: + model: Torch module (should be in eval() mode prior to call). + dataset: ImageFolder style dataset. + batch_size: Batch size for evaluation. + topk: Maximum k for accuracy (default 5). + log_every: Log running accuracy every N batches. + Returns: + (top1_accuracy, topk_accuracy) + """ + # Some exported / quantized models (torchao PT2E) disallow direct eval()/train(). + # Try to switch to eval mode, but degrade gracefully if unsupported. + try: + model.eval() + except NotImplementedError: + # Attempt to enable train/eval overrides if torchao helper is present. + try: + from torchao.quantization.pt2e.utils import ( # type: ignore + allow_exported_model_train_eval, + ) + + allow_exported_model_train_eval(model) + try: + model.eval() + except Exception: + logger.debug( + "Model eval still not supported after allow_exported_model_train_eval; proceeding without explicit eval()." + ) + except Exception: + logger.debug( + "Model eval() unsupported and torchao allow_exported_model_train_eval not available; proceeding." 
+ ) + loaded_dataset = DataLoader(dataset, batch_size=batch_size, shuffle=False) + top1_correct = 0 + topk_correct = 0 + total = 0 + with torch.inference_mode(): # disable autograd + some backend optimizations + for i, (image, target) in enumerate(loaded_dataset): + prediction = model(image) + topk_indices = torch.topk(prediction, k=topk, dim=1).indices + # target reshaped for broadcasting + target_view = target.view(-1, 1) + top1_correct += (topk_indices[:, :1] == target_view).sum().item() + topk_correct += (topk_indices == target_view).sum().item() + batch_sz = image.size(0) + total += batch_sz + if (i + 1) % log_every == 0 or total == len(dataset): + logger.info( + "Eval progress: %d / %d top1=%.4f top%d=%.4f", + total, + len(dataset), + top1_correct / total, + topk, + topk_correct / total, + ) + top1_accuracy = top1_correct / len(dataset) + topk_accuracy = topk_correct / len(dataset) + return top1_accuracy, topk_accuracy + REQUIRES_CONFIG = False def __init__( @@ -52,12 +184,13 @@ def __init__( self.tosa_output_path = "" def get_model_error(self) -> defaultdict: - """ - Returns a dict containing the following metrics between the outputs of the FP32 and INT8 model: - - Maximum error - - Maximum absolute error - - Maximum percentage error - - Mean absolute error + """Return per-output quantization error statistics. 
+ + Metrics (lists per output tensor): + max_error + max_absolute_error + max_percentage_error (safe-divided; zero fp32 elements -> 0%) + mean_absolute_error """ fp32_outputs, _ = tree_flatten(self.fp32_model(*self.example_input)) int8_outputs, _ = tree_flatten(self.int8_model(*self.example_input)) @@ -66,7 +199,12 @@ def get_model_error(self) -> defaultdict: for fp32_output, int8_output in zip(fp32_outputs, int8_outputs): difference = fp32_output - int8_output - percentage_error = torch.div(difference, fp32_output) * 100 + # Avoid divide by zero: elements where fp32 == 0 produce 0% contribution + percentage_error = torch.where( + fp32_output != 0, + difference / fp32_output * 100, + torch.zeros_like(difference), + ) model_error_dict["max_error"].append(torch.max(difference).item()) model_error_dict["max_absolute_error"].append( torch.max(torch.abs(difference)).item() @@ -131,69 +269,186 @@ def __init__( @staticmethod def __load_dataset(directory: str) -> datasets.ImageFolder: - directory_path = Path(directory) - if not directory_path.exists(): - raise FileNotFoundError(f"Directory: {directory} does not exist.") - - transform = transforms.Compose( - [ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize( - mean=[0.484, 0.454, 0.403], std=[0.225, 0.220, 0.220] - ), - ] - ) - return datasets.ImageFolder(directory_path, transform=transform) + return _load_imagenet_folder(directory) @staticmethod def get_calibrator(training_dataset_path: str) -> DataLoader: dataset = MobileNetV2Evaluator.__load_dataset(training_dataset_path) - rand_indices = random.sample(range(len(dataset)), k=1000) + return _build_calibration_loader(dataset, 1000) - # Return a subset of the dataset to be used for calibration - return torch.utils.data.DataLoader( - torch.utils.data.Subset(dataset, rand_indices), - batch_size=1, - shuffle=False, + @classmethod + def from_config( + cls, + model_name: str, + fp32_model: Module, + int8_model: Module, + 
example_input: Tuple[torch.Tensor], + tosa_output_path: str | None, + config: dict[str, Any], + ) -> "MobileNetV2Evaluator": + """Factory constructing evaluator from a config dict. + + Expected keys: batch_size, validation_dataset_path + """ + return cls( + model_name, + fp32_model, + int8_model, + example_input, + tosa_output_path, + batch_size=config["batch_size"], + validation_dataset_path=config["validation_dataset_path"], ) - def __evaluate_mobilenet(self) -> Tuple[float, float]: + def evaluate(self) -> dict[str, Any]: + # Load dataset and compute top-1 / top-5 dataset = MobileNetV2Evaluator.__load_dataset(self.__validation_set_path) - loaded_dataset = DataLoader( - dataset, - batch_size=self.__batch_size, - shuffle=False, + top1_correct, top5_correct = GenericModelEvaluator.evaluate_topk( + self.int8_model, dataset, self.__batch_size, topk=5 ) + output = super().evaluate() - top1_correct = 0 - top5_correct = 0 + output["metrics"]["accuracy"] = {"top-1": top1_correct, "top-5": top5_correct} + return output - for i, (image, target) in enumerate(loaded_dataset): - prediction = self.int8_model(image) - top1_prediction = torch.topk(prediction, k=1, dim=1).indices - top5_prediction = torch.topk(prediction, k=5, dim=1).indices - top1_correct += (top1_prediction == target.view(-1, 1)).sum().item() - top5_correct += (top5_prediction == target.view(-1, 1)).sum().item() +class DeiTTinyEvaluator(GenericModelEvaluator): + REQUIRES_CONFIG = True - logger.info("Iteration: {}".format((i + 1) * self.__batch_size)) - logger.info( - "Top 1: {}".format(top1_correct / ((i + 1) * self.__batch_size)) - ) - logger.info( - "Top 5: {}".format(top5_correct / ((i + 1) * self.__batch_size)) - ) + def __init__( + self, + model_name: str, + fp32_model: Module, + int8_model: Module, + example_input: Tuple[torch.Tensor], + tosa_output_path: str | None, + batch_size: int, + validation_dataset_path: str, + ) -> None: + super().__init__( + model_name, fp32_model, int8_model, example_input, 
tosa_output_path + ) + self.__batch_size = batch_size + self.__validation_set_path = validation_dataset_path - top1_accuracy = top1_correct / len(dataset) - top5_accuracy = top5_correct / len(dataset) + @staticmethod + def __load_dataset(directory: str) -> datasets.ImageFolder: + return _load_imagenet_folder(directory) + + @staticmethod + def get_calibrator(training_dataset_path: str) -> DataLoader: + dataset = DeiTTinyEvaluator.__load_dataset(training_dataset_path) + return _build_calibration_loader(dataset, 1000) + + @classmethod + def from_config( + cls, + model_name: str, + fp32_model: Module, + int8_model: Module, + example_input: Tuple[torch.Tensor], + tosa_output_path: str | None, + config: dict[str, Any], + ) -> "DeiTTinyEvaluator": + """Factory constructing evaluator from a config dict. - return top1_accuracy, top5_accuracy + Expected keys: batch_size, validation_dataset_path + """ + return cls( + model_name, + fp32_model, + int8_model, + example_input, + tosa_output_path, + batch_size=config["batch_size"], + validation_dataset_path=config["validation_dataset_path"], + ) def evaluate(self) -> dict[str, Any]: - top1_correct, top5_correct = self.__evaluate_mobilenet() + # Load dataset and compute top-1 / top-5 + dataset = DeiTTinyEvaluator.__load_dataset(self.__validation_set_path) + top1, top5 = GenericModelEvaluator.evaluate_topk( + self.int8_model, dataset, self.__batch_size, topk=5 + ) output = super().evaluate() - - output["metrics"]["accuracy"] = {"top-1": top1_correct, "top-5": top5_correct} + output["metrics"]["accuracy"] = {"top-1": top1, "top-5": top5} return output + + +evaluators: dict[str, type[GenericModelEvaluator]] = { + "generic": GenericModelEvaluator, + "mv2": MobileNetV2Evaluator, + "deit_tiny": DeiTTinyEvaluator, +} + + +def evaluator_calibration_data( + evaluator_name: str, + evaluator_config: str | None, +): + evaluator = evaluators[evaluator_name] + + if hasattr(evaluator, "get_calibrator"): + assert evaluator_config is not None + + 
config_path = Path(evaluator_config) + with config_path.open() as f: + config = json.load(f) + + if evaluator is MobileNetV2Evaluator: + return evaluator.get_calibrator( + training_dataset_path=config["training_dataset_path"] + ) + if evaluator is DeiTTinyEvaluator: + return evaluator.get_calibrator( + training_dataset_path=config["training_dataset_path"] + ) + else: + raise RuntimeError(f"Unknown evaluator: {evaluator_name}") + + +def evaluate_model( + model_name: str, + intermediates: str, + model_fp32: torch.nn.Module, + model_int8: torch.nn.Module, + example_inputs: Tuple[torch.Tensor], + evaluator_name: str, + evaluator_config: str | None, +) -> None: + evaluator = evaluators[evaluator_name] + + intermediates_path = Path(intermediates) + tosa_paths = list(intermediates_path.glob("*.tosa")) + + if evaluator.REQUIRES_CONFIG: + assert evaluator_config is not None + config_path = Path(evaluator_config) + with config_path.open() as f: + config = json.load(f) + + # Prefer a subclass provided from_config if available. 
+ if hasattr(evaluator, "from_config"): + factory = cast(Any, evaluator.from_config) # type: ignore[attr-defined] + init_evaluator = factory( + model_name, + model_fp32, + model_int8, + example_inputs, + str(tosa_paths[0]), + config, + ) + else: + raise RuntimeError( + f"Evaluator {evaluator_name} requires config but does not implement from_config()" + ) + else: + init_evaluator = evaluator( + model_name, model_fp32, model_int8, example_inputs, str(tosa_paths[0]) + ) + + quant_metrics = init_evaluator.evaluate() + output_json_path = intermediates_path / "quant_metrics.json" + + with output_json_path.open("w") as json_file: + json.dump(quant_metrics, json_file) diff --git a/backends/backends.bzl b/backends/backends.bzl index 5ca30a83b54..42aed059f22 100644 --- a/backends/backends.bzl +++ b/backends/backends.bzl @@ -6,7 +6,6 @@ def get_all_cpu_backend_targets(): """ return [ "//executorch/backends/xnnpack:xnnpack_backend", - "//executorch/backends/fb/qnnpack:qnnpack_backend", ] def get_all_cpu_aot_and_backend_targets(): @@ -18,6 +17,4 @@ def get_all_cpu_aot_and_backend_targets(): return [ "//executorch/backends/xnnpack:xnnpack_preprocess", "//executorch/backends/xnnpack/partition:xnnpack_partitioner", - "//executorch/backends/fb/qnnpack:qnnpack_preprocess", - "//executorch/backends/fb/qnnpack/partition:qnnpack_partitioner", ] + get_all_cpu_backend_targets() diff --git a/backends/cadence/CMakeLists.txt b/backends/cadence/CMakeLists.txt index 47183bed21d..271b4806614 100644 --- a/backends/cadence/CMakeLists.txt +++ b/backends/cadence/CMakeLists.txt @@ -88,8 +88,11 @@ elseif(EXECUTORCH_FUSION_G3_OPT) ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10 ) +elseif(EXECUTORCH_VISION_OPT) + set(TARGET_DIR vision) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels) else() - set(TARGET_DIR reference) + set(TARGET_DIR generic) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels) endif() 
diff --git a/backends/cadence/aot/TARGETS b/backends/cadence/aot/TARGETS index 0ec09bf4f9e..94ab6de0e29 100644 --- a/backends/cadence/aot/TARGETS +++ b/backends/cadence/aot/TARGETS @@ -130,6 +130,7 @@ runtime.python_library( deps = [ "fbcode//caffe2:torch", "fbcode//executorch/exir:scalar_type", + "fbcode//executorch/kernels/quantized:custom_ops_generated_lib", ], ) @@ -143,18 +144,13 @@ executorch_generated_lib( visibility = ["PUBLIC"], deps = [ "//executorch/backends/cadence/generic/kernels:cadence_kernels", - # Individual operator targets instead of combined cadence_generic_ops - "//executorch/backends/cadence/generic/operators:op_add", - "//executorch/backends/cadence/generic/operators:op_embedding", - "//executorch/backends/cadence/generic/operators:op_full", "//executorch/backends/cadence/generic/operators:op_requantize_out", - "//executorch/backends/cadence/generic/operators:op_view_copy", "//executorch/backends/cadence/generic/operators:im2row_out", "//executorch/backends/cadence/generic/operators:dequantize_per_tensor", "//executorch/backends/cadence/generic/operators:quantize_per_tensor", "//executorch/backends/cadence/generic/operators:quantized_add_out", - "//executorch/backends/cadence/generic/operators:quantized_conv_nchw_out", - "//executorch/backends/cadence/generic/operators:quantized_conv_nhwc_out", + "//executorch/backends/cadence/generic/operators:quantized_conv2d_nchw_out", + "//executorch/backends/cadence/generic/operators:quantized_conv2d_nhwc_out", "//executorch/backends/cadence/generic/operators:quantized_fully_connected_out", "//executorch/backends/cadence/generic/operators:quantized_layer_norm", "//executorch/backends/cadence/generic/operators:quantized_linear_out", diff --git a/backends/cadence/aot/compiler.py b/backends/cadence/aot/compiler.py index 6c497d5bec4..765ddcd581d 100644 --- a/backends/cadence/aot/compiler.py +++ b/backends/cadence/aot/compiler.py @@ -24,6 +24,7 @@ from executorch.backends.cadence.aot.quantizer.quantizer 
import ( CadenceDefaultQuantizer, CadenceQuantizer, + CadenceW8A32MixedQuantizer, ) from executorch.backends.cadence.aot.utils import ( get_default_memory_config, @@ -59,6 +60,7 @@ def trace( model: torch.nn.Module, inputs: tuple[object, ...], dump_graphs: bool = False, + quantizer: Optional[CadenceQuantizer] = None, ) -> ExportedProgram: """ Trace the model with export and return an ExportedProgram. @@ -73,6 +75,12 @@ def trace( torch.ops.aten.rms_norm.default, ] + if isinstance(quantizer, CadenceW8A32MixedQuantizer): + ops_to_keep += [ + torch.ops.aten.gru.input, + torch.ops.aten.gru.data, + ] + program = trace_fn( model, inputs, is_qat=False, strict=True, ops_to_keep=ops_to_keep ) @@ -99,7 +107,7 @@ def prepare_pt2( Returns a GraphModule with the prepared model. """ - traced_program = trace(model, inputs, dump_graphs=dump_graphs) + traced_program = trace(model, inputs, dump_graphs=dump_graphs, quantizer=quantizer) prepared_program = prepare_traced_pt2( traced_program, quantizer, dump_graphs=dump_graphs ) @@ -184,7 +192,7 @@ def get_fake_quant_model( # Make the model inference mode by calling model.eval() model.eval() - program = trace(model, inputs, dump_graphs=dump_graphs) + program = trace(model, inputs, dump_graphs=dump_graphs, quantizer=quantizer) if dump_graphs: logging.info("Graph after trace:") diff --git a/backends/cadence/aot/functions.yaml b/backends/cadence/aot/functions.yaml index 1c626887649..d8024c0245a 100644 --- a/backends/cadence/aot/functions.yaml +++ b/backends/cadence/aot/functions.yaml @@ -184,21 +184,81 @@ - arg_meta: null kernel_name: impl::generic::quantize_per_tensor_out +- func: cadence::quantize_per_tensor_asym8s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) 
+ variants: function + kernels: + - arg_meta: null + kernel_name: impl::generic::quantize_per_tensor_asym8s_out + +- func: cadence::quantize_per_tensor_asym8u.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::generic::quantize_per_tensor_asym8u_out + +- func: cadence::quantize_per_tensor_asym16s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::generic::quantize_per_tensor_asym16s_out + +- func: cadence::quantize_per_tensor_asym16u.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::generic::quantize_per_tensor_asym16u_out + +- func: cadence::quantize_per_tensor_asym32s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::generic::quantize_per_tensor_asym32s_out + - func: cadence::dequantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) variants: function kernels: - arg_meta: null kernel_name: impl::generic::dequantize_per_tensor_out -- func: cadence::quantized_conv_nchw.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) 
+- func: cadence::dequantize_per_tensor_asym8s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::generic::dequantize_per_tensor_asym8s_out + +- func: cadence::dequantize_per_tensor_asym8u.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::generic::dequantize_per_tensor_asym8u_out + +- func: cadence::dequantize_per_tensor_asym16s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::generic::dequantize_per_tensor_asym16s_out + +- func: cadence::dequantize_per_tensor_asym16u.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::generic::dequantize_per_tensor_asym16u_out + +- func: cadence::dequantize_per_tensor_asym32s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::generic::dequantize_per_tensor_asym32s_out + +- func: cadence::quantized_conv2d_nchw.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nchw_out + kernel_name: impl::generic::quantized_conv2d_nchw_out -- func: cadence::quantized_conv_nhwc.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nhwc_out + kernel_name: impl::generic::quantized_conv2d_nhwc_out - func: cadence::quantized_layer_norm.out(Tensor input, Tensor in_scale, Tensor in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!) kernels: @@ -289,95 +349,95 @@ - arg_meta: null kernel_name: impl::generic::im2row_per_tensor_out -- func: cadence::quantized_conv_nchw.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nchw_per_tensor_out + kernel_name: impl::generic::quantized_conv2d_nchw_per_tensor_out -- func: cadence::quantized_conv_nhwc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nhwc_per_tensor_out + kernel_name: impl::generic::quantized_conv2d_nhwc_per_tensor_out -- func: cadence::quantized_conv_nchw_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::generic::quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor_out -- func: cadence::quantized_conv_nchw_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::generic::quantized_conv2d_nchw_asym8uxsym8u_asym8u_per_tensor_out -- func: cadence::quantized_conv_nhwc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::generic::quantized_conv2d_nhwc_asym8sxsym8s_asym8s_per_tensor_out -- func: cadence::quantized_conv_nhwc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::generic::quantized_conv2d_nhwc_asym8uxsym8u_asym8u_per_tensor_out -- func: cadence::quantized_conv_nchw_dilated_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::generic::quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out -- func: cadence::quantized_conv_nchw_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::generic::quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out -- func: cadence::quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::generic::quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out -- func: cadence::quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::generic::quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out -- func: cadence::quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::generic::quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out -- func: cadence::quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::generic::quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out -- func: cadence::quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::generic::quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out -- func: cadence::quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::generic::quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out -- func: cadence::quantized_conv1d_nchw_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::generic::quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out -- func: cadence::quantized_conv1d_nchw_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::generic::quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out -- func: cadence::quantized_conv1d_nhwc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::generic::quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out -- func: cadence::quantized_conv1d_nhwc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::generic::quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::generic::quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out - func: cadence::quantized_fully_connected.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!) kernels: diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index a5f3102d600..3bdbb33d59b 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -284,111 +284,171 @@ - arg_meta: null kernel_name: impl::HiFi::quantize_per_tensor_out +- func: cadence::quantize_per_tensor_asym8s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) 
+ variants: function + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantize_per_tensor_asym8s_out + +- func: cadence::quantize_per_tensor_asym8u.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantize_per_tensor_asym8u_out + +- func: cadence::quantize_per_tensor_asym16s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantize_per_tensor_asym16s_out + +- func: cadence::quantize_per_tensor_asym16u.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantize_per_tensor_asym16u_out + +- func: cadence::quantize_per_tensor_asym32s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantize_per_tensor_asym32s_out + - func: cadence::dequantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) variants: function kernels: - arg_meta: null kernel_name: impl::HiFi::dequantize_per_tensor_out -- func: cadence::quantized_conv_nchw.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::dequantize_per_tensor_asym8s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!)
out) -> Tensor(a!) + variants: function kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nchw_out + kernel_name: impl::HiFi::dequantize_per_tensor_asym8s_out -- func: cadence::quantized_conv_nhwc.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::dequantize_per_tensor_asym8u.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nhwc_out + kernel_name: impl::HiFi::dequantize_per_tensor_asym8u_out -- func: cadence::quantized_conv_nchw.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::dequantize_per_tensor_asym16s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nchw_per_tensor_out + kernel_name: impl::HiFi::dequantize_per_tensor_asym16s_out -- func: cadence::quantized_conv_nhwc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::dequantize_per_tensor_asym16u.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) 
+ variants: function kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nhwc_per_tensor_out + kernel_name: impl::HiFi::dequantize_per_tensor_asym16u_out -- func: cadence::quantized_conv_nchw_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::dequantize_per_tensor_asym32s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::HiFi::dequantize_per_tensor_asym32s_out -- func: cadence::quantized_conv_nchw_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!)
kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nchw_out -- func: cadence::quantized_conv_nhwc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nhwc_out -- func: cadence::quantized_conv_nhwc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nchw_per_tensor_out -- func: cadence::quantized_conv_nchw_dilated_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nhwc_per_tensor_out -- func: cadence::quantized_conv_nchw_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor_out -- func: cadence::quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nchw_asym8uxsym8u_asym8u_per_tensor_out -- func: cadence::quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nhwc_asym8sxsym8s_asym8s_per_tensor_out -- func: cadence::quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nhwc_asym8uxsym8u_asym8u_per_tensor_out -- func: cadence::quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out -- func: cadence::quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out -- func: cadence::quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out -- func: cadence::quantized_conv1d_nchw_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out -- func: cadence::quantized_conv1d_nchw_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out -- func: cadence::quantized_conv1d_nhwc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out -- func: cadence::quantized_conv1d_nhwc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null - kernel_name: impl::HiFi::quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out + kernel_name: impl::HiFi::quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out + +- func: cadence::quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out + +- func: cadence::quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out + +- func: cadence::quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out + +- func: cadence::quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
+ kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out + +- func: cadence::quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out - func: cadence::quantized_layer_norm.out(Tensor input, Tensor in_scale, Tensor in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!) kernels: @@ -488,3 +548,18 @@ kernels: - arg_meta: null kernel_name: impl::HiFi::quantized_fully_connected_asym8uxasym8u_asym8u_per_tensor_out + +- func: cadence::quantized_w8a32_linear.out(Tensor input, Tensor weight, float w_scale, Tensor bias, float b_scale, *, Tensor(a!) output) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_w8a32_linear_out + +- func: cadence::quantized_w8a32_conv.out(Tensor input, Tensor weight, float w_scale, Tensor bias, float b_scale, *, Tensor(a!) output) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_w8a32_conv_out + +- func: cadence::quantized_w8a32_gru.out(Tensor inputs, Tensor hidden, Tensor weights_inputs, float w_i_scale, Tensor weights_hidden, float w_h_scale, Tensor bias_inputs, float b_i_scale, Tensor bias_hidden, float b_h_scale, *, Tensor(a!) out) -> Tensor(a!) 
+ kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_w8a32_gru_out diff --git a/backends/cadence/aot/functions_vision.yaml b/backends/cadence/aot/functions_vision.yaml new file mode 100644 index 00000000000..8d9cdd16105 --- /dev/null +++ b/backends/cadence/aot/functions_vision.yaml @@ -0,0 +1,265 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This yaml file contains operators that are also defined by the ATen library. +# For lean mode: +# - Codegen'd target `executorch_generated_lib` will be reading all the information +# from this file, including operator schema and kernel metadata. +# - Selective build target `codegen:executorch_defined_ops` now is selecting all the +# operators in this file, by dumping all the op names into `selected_operators.yaml`. +# +# See the README.md file in executorch/kernels/portable for a description of the syntax used +# by this file. + + +# aten ops +- op: _to_copy.out + kernels: + - arg_meta: null + kernel_name: torch::executor::to_copy_out + +- op: _softmax.out + kernels: + - arg_meta: null + kernel_name: impl::vision::native::_softmax_out + +- op: add.out + kernels: + - arg_meta: null + kernel_name: impl::vision::native::add_out + +- op: bmm.out + kernels: + - arg_meta: null + kernel_name: torch::executor::bmm_out + +- op: cat.out + kernels: + - arg_meta: null + kernel_name: torch::executor::cat_out + +- op: clone.out + kernels: + - arg_meta: null + kernel_name: torch::executor::clone_out + +- op: div.out + kernels: + - arg_meta: null + kernel_name: torch::executor::div_out + +- op: div.out_mode + kernels: + - arg_meta: null + kernel_name: torch::executor::div_out_mode + +- op: embedding.out + kernels: + - arg_meta: null + kernel_name: impl::vision::native::embedding_out + +- op: empty.out + kernels: + - arg_meta: null + kernel_name: torch::executor::empty_out + +- op: expand_copy.out + kernels: + - arg_meta: null + kernel_name: torch::executor::expand_copy_out + +- op: full.out + kernels: + - 
arg_meta: null + kernel_name: impl::vision::native::full_out + +- op: gelu.out + kernels: + - arg_meta: null + kernel_name: torch::executor::gelu_out + +- op: hardtanh.out + kernels: + - arg_meta: null + kernel_name: torch::executor::hardtanh_out + +- op: max_pool2d_with_indices.out + kernels: + - arg_meta: null + kernel_name: torch::executor::max_pool2d_with_indices_out + +- op: mean.out + kernels: + - arg_meta: null + kernel_name: torch::executor::mean_dim_out + +- op: mul.out + kernels: + - arg_meta: null + kernel_name: torch::executor::mul_out + +- op: mul.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::mul_scalar_out + +- op: permute_copy.out + kernels: + - arg_meta: null + kernel_name: torch::executor::permute_copy_out + +- op: rsqrt.out + kernels: + - arg_meta: null + kernel_name: torch::executor::rsqrt_out + +- op: sigmoid.out + kernels: + - arg_meta: null + kernel_name: torch::executor::sigmoid_out + +- op: slice_copy.Tensor_out + kernels: + - arg_meta: null + kernel_name: torch::executor::slice_copy_Tensor_out + +- op: split_with_sizes_copy.out + kernels: + - arg_meta: null + kernel_name: torch::executor::split_with_sizes_copy_out + +- op: sub.out + kernels: + - arg_meta: null + kernel_name: torch::executor::sub_out + +- op: view_copy.out + kernels: + - arg_meta: null + kernel_name: impl::vision::native::view_copy_out + +- op: where.self_out + kernels: + - arg_meta: null + kernel_name: torch::executor::where_out + +- op: transpose_copy.int_out + kernels: + - arg_meta: null + kernel_name: torch::executor::transpose_copy_int_out + +- op: eq.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::eq_scalar_out + +- op: logical_not.out + kernels: + - arg_meta: null + kernel_name: torch::executor::logical_not_out + +- op: any.out + kernels: + - arg_meta: null + kernel_name: torch::executor::any_out + +- op: native_group_norm.out + kernels: + - arg_meta: null + kernel_name: torch::executor::native_group_norm_out + +- 
op: sum.IntList_out + kernels: + - arg_meta: null + kernel_name: torch::executor::sum_dim_out + +- op: select_copy.int_out + kernels: + - arg_meta: null + kernel_name: torch::executor::select_copy_int_out + +# custom ops +- func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::vision::native::quantize_per_tensor_out + +- func: cadence::dequantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: impl::vision::native::dequantize_per_tensor_out + +- func: cadence::quantized_conv.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::vision::native::quantized_conv_out + +- func: cadence::quantized_layer_norm.out(Tensor input, Tensor in_scale, Tensor in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::vision::native::quantized_layer_norm_out +- func: cadence::quantized_layer_norm.per_tensor_out(Tensor input, float in_scale, int in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!) 
+ kernels: + - arg_meta: null + kernel_name: impl::vision::native::quantized_layer_norm_per_tensor_out + +- func: cadence::quantized_linear.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::vision::native::quantized_linear_out + +- func: cadence::quantized_relu.out(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::vision::native::quantized_relu_out + +- func: cadence::quantized_relu.per_tensor_out(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::vision::native::quantized_relu_per_tensor_out + +- func: cadence::quantized_matmul.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::vision::native::quantized_matmul_out + +- func: cadence::quantized_linear.per_tensor_out(Tensor src, Tensor weight, Tensor bias, SymInt src_zero_point, SymInt weight_zero_point, SymInt out_multiplier, SymInt out_shift, SymInt out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::vision::native::quantized_linear_per_tensor_out + +- func: cadence::im2row.out(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, Tensor in_zero_point, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) 
+ kernels: + - arg_meta: null + kernel_name: impl::vision::native::im2row_out + +- func: cadence::im2row.per_tensor_out(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, int in_zero_point, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::vision::native::im2row_per_tensor_out + +- func: cadence::quantized_conv.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::vision::native::quantized_conv_per_tensor_out + +- func: cadence::quantized_fully_connected.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::vision::native::quantized_fully_connected_out + +- func: cadence::quantized_fully_connected.per_tensor_out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, int weight_zero_point, int out_multiplier, int out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::vision::native::quantized_fully_connected_per_tensor_out + +- func: cadence::requantize.out(Tensor input, Tensor in_scale, Tensor in_zero_point, Tensor out_scale, Tensor out_zero_point, ScalarType out_dtype, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::vision::native::requantize_out + +- func: cadence::requantize.per_tensor_out(Tensor input, float in_scale, int in_zero_point, float out_scale, int out_zero_point, ScalarType out_dtype, *, Tensor(a!) out) -> Tensor(a!) 
+ kernels: + - arg_meta: null + kernel_name: impl::vision::native::requantize_per_tensor_out diff --git a/backends/cadence/aot/graph_builder.py b/backends/cadence/aot/graph_builder.py index 2cfd7900e8e..f609ba55472 100644 --- a/backends/cadence/aot/graph_builder.py +++ b/backends/cadence/aot/graph_builder.py @@ -44,12 +44,12 @@ class GraphBuilder(ExportPass): gm = builder.get_graph_module() """ - def __init__(self) -> None: + def __init__(self, fake_tensor_mode: Optional[FakeTensorMode] = None) -> None: self.exporter = ExportPass() self.tracer: ExportPass.ExportTracer = self.ExportTracer( self, torch.fx.graph.CodeGen() ) - self.fake_tensor_mode = FakeTensorMode( + self.fake_tensor_mode: FakeTensorMode = fake_tensor_mode or FakeTensorMode( allow_fallback_kernels=False, allow_non_fake_inputs=True, ) diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py index bd2bf32834d..f827488adfb 100644 --- a/backends/cadence/aot/ops_registrations.py +++ b/backends/cadence/aot/ops_registrations.py @@ -28,12 +28,78 @@ "quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)" ) +lib.define( + "quantize_per_tensor_asym8s(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> (Tensor Z)" +) +lib.define( + "quantize_per_tensor_asym8s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)" +) + +lib.define( + "quantize_per_tensor_asym8u(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> (Tensor Z)" +) +lib.define( + "quantize_per_tensor_asym8u.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) 
out) -> Tensor(a!)" +) + +lib.define( + "quantize_per_tensor_asym16s(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> (Tensor Z)" +) +lib.define( + "quantize_per_tensor_asym16s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)" +) + +lib.define( + "quantize_per_tensor_asym16u(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> (Tensor Z)" +) +lib.define( + "quantize_per_tensor_asym16u.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)" +) + +lib.define( + "quantize_per_tensor_asym32s(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> (Tensor Z)" +) +lib.define( + "quantize_per_tensor_asym32s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)" +) + lib.define( "dequantize_per_tensor(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> (Tensor Z)" ) lib.define( "dequantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)" ) +lib.define( + "dequantize_per_tensor_asym8s(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> (Tensor Z)" +) +lib.define( + "dequantize_per_tensor_asym8s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)" +) +lib.define( + "dequantize_per_tensor_asym8u(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> (Tensor Z)" +) +lib.define( + "dequantize_per_tensor_asym8u.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) 
out) -> Tensor(a!)" +) +lib.define( + "dequantize_per_tensor_asym16s(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> (Tensor Z)" +) +lib.define( + "dequantize_per_tensor_asym16s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)" +) +lib.define( + "dequantize_per_tensor_asym16u(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> (Tensor Z)" +) +lib.define( + "dequantize_per_tensor_asym16u.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)" +) + +lib.define( + "dequantize_per_tensor_asym32s(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> (Tensor Z)" +) +lib.define( + "dequantize_per_tensor_asym32s.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)" +) lib.define( "quantized_layer_norm(Tensor X, Tensor X_scale, Tensor X_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point) -> (Tensor Y)" @@ -86,28 +152,28 @@ ) lib.define( - "quantized_conv_nhwc(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift) -> (Tensor Z)" + "quantized_conv2d_nhwc(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nhwc.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, 
Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nhwc.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nhwc.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv2d_nhwc.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nhwc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nhwc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nchw(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift) -> (Tensor Z)" + "quantized_conv2d_nchw(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nchw.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nchw.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nchw.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv2d_nchw.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nchw.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nchw.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" ) lib.define( "quantized_matmul(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed=False) -> (Tensor Z)" @@ -122,100 +188,100 @@ "quantized_matmul_asym8sxasym8s_asym8s.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed=False, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nchw_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv2d_nchw_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nchw_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nchw_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nchw_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv2d_nchw_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nchw_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nchw_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nhwc_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv2d_nhwc_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nhwc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nhwc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nhwc_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv2d_nhwc_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nhwc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nhwc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nchw_dilated_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nchw_dilated_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nchw_dilated_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nchw_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv1d_nchw_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv1d_nchw_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv1d_nchw_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv1d_nchw_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv1d_nhwc_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv1d_nhwc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv1d_nhwc_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv1d_nhwc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) lib.define( - "quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" ) lib.define( "quantized_matmul_asym8uxasym8u_asym8u(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed=False) -> (Tensor Z)" @@ -254,7 +320,7 @@ "float out_scale, int out_zero_point) -> (Tensor Z)" ) lib.define( - "quantized_embedding_byte(Tensor weight, Tensor weight_scales, Tensor weight_zero_points, " + "quantized_embedding_byte(Tensor weight, Tensor weight_scales, Tensor? 
weight_zero_points, " "Tensor indices, bool pruned_weights=False) -> (Tensor X)" ) lib.define( @@ -263,7 +329,7 @@ "Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, bool channel_last=False) -> (Tensor out)" ) lib.define( - "avg_pool2d(Tensor input, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, " + "avg_pool2d(Tensor input, int[2] kernel_size, int[2] stride=[], int[2] padding=[], bool ceil_mode=False, " "bool count_include_pad=True, int? divisor_override=None, Tensor? in_zero_point=None, bool channel_last=False) -> (Tensor out)" ) lib.define( @@ -448,7 +514,7 @@ "int weight_zero_point, int out_multiplier, int out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)" ) lib.define( - "quantized_embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor weight_zero_points, " + "quantized_embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, " "Tensor indices, bool pruned_weights=False, *, Tensor(a!) out) -> Tensor(a!)" ) @@ -459,7 +525,7 @@ "Tensor out_multiplier, Tensor out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)" ) lib.define( - "avg_pool2d.out(Tensor input, int[2] kernel_size, int[2] stride=[], int[2] padding=0, " + "avg_pool2d.out(Tensor input, int[2] kernel_size, int[2] stride=[], int[2] padding=[], " "bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, " "Tensor? in_zero_point=None, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)" ) @@ -498,6 +564,29 @@ "_softmax_f32_f32.out(Tensor self, int dim, bool? half_to_float, *, Tensor(a!) out) -> Tensor(a!)" ) +lib.define( + "quantized_w8a32_linear(Tensor input, Tensor weight, float w_scale, Tensor bias, float b_scale) -> Tensor" +) +lib.define( + "quantized_w8a32_linear.out(Tensor input, Tensor weight, float w_scale, Tensor bias, float b_scale, *, Tensor(a!) 
output) -> Tensor(a!)" +) + +lib.define( + "quantized_w8a32_conv(Tensor input, Tensor weight, float w_scale, Tensor bias, float b_scale) -> Tensor" +) +lib.define( + "quantized_w8a32_conv.out(Tensor input, Tensor weight, float w_scale, Tensor bias, float b_scale, *, Tensor(a!) output) -> Tensor(a!)" +) + +lib.define( + "quantized_w8a32_gru(Tensor inputs, Tensor hidden, Tensor weights_inputs, float w_i_scale, Tensor weights_hidden, float w_h_scale, Tensor bias_inputs, float b_i_scale, Tensor bias_hidden, float b_h_scale) -> Tensor" +) + +lib.define( + "quantized_w8a32_gru.out(Tensor inputs, Tensor hidden, Tensor weights_inputs, float w_i_scale, Tensor weights_hidden, float w_h_scale, Tensor bias_inputs, float b_i_scale, Tensor bias_hidden, float b_h_scale, *, Tensor(a!) out) -> Tensor(a!)" +) + + # Custom ops with aten namespace. Need to specify the lib var as FRAGMENT type as aten library is already defined aten_lib = Library("aten", "FRAGMENT") aten_lib.define( @@ -554,6 +643,66 @@ def quantize_per_tensor_meta( return input.new_empty(input.size(), dtype=dtype) +@register_fake("cadence::quantize_per_tensor_asym8s") +def quantize_per_tensor_asym8s_meta( + input: torch.Tensor, + scale: float, + zero_point: int, + quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> torch.Tensor: + return input.new_empty(input.size(), dtype=dtype) + + +@register_fake("cadence::quantize_per_tensor_asym8u") +def quantize_per_tensor_asym8u_meta( + input: torch.Tensor, + scale: float, + zero_point: int, + quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> torch.Tensor: + return input.new_empty(input.size(), dtype=dtype) + + +@register_fake("cadence::quantize_per_tensor_asym16s") +def quantize_per_tensor_asym16s_meta( + input: torch.Tensor, + scale: float, + zero_point: int, + quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> torch.Tensor: + return input.new_empty(input.size(), dtype=dtype) + + +@register_fake("cadence::quantize_per_tensor_asym16u") +def 
quantize_per_tensor_asym16u_meta( + input: torch.Tensor, + scale: float, + zero_point: int, + quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> torch.Tensor: + return input.new_empty(input.size(), dtype=dtype) + + +@register_fake("cadence::quantize_per_tensor_asym32s") +def quantize_per_tensor_asym32s_meta( + input: torch.Tensor, + scale: float, + zero_point: int, + quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> torch.Tensor: + return input.new_empty(input.size(), dtype=dtype) + + @register_fake("cadence::dequantize_per_tensor") def dequantize_per_tensor_meta( input: torch.Tensor, @@ -566,6 +715,66 @@ def dequantize_per_tensor_meta( return input.new_empty(input.size(), dtype=torch.float) +@register_fake("cadence::dequantize_per_tensor_asym8s") +def dequantize_per_tensor_asym8s_meta( + input: torch.Tensor, + scale: float, + zero_point: int, + quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> torch.Tensor: + return input.new_empty(input.size(), dtype=torch.float) + + +@register_fake("cadence::dequantize_per_tensor_asym8u") +def dequantize_per_tensor_asym8u_meta( + input: torch.Tensor, + scale: float, + zero_point: int, + quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> torch.Tensor: + return input.new_empty(input.size(), dtype=torch.float) + + +@register_fake("cadence::dequantize_per_tensor_asym16s") +def dequantize_per_tensor_asym16s_meta( + input: torch.Tensor, + scale: float, + zero_point: int, + quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> torch.Tensor: + return input.new_empty(input.size(), dtype=torch.float) + + +@register_fake("cadence::dequantize_per_tensor_asym16u") +def dequantize_per_tensor_asym16u_meta( + input: torch.Tensor, + scale: float, + zero_point: int, + quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> torch.Tensor: + return input.new_empty(input.size(), dtype=torch.float) + + +@register_fake("cadence::dequantize_per_tensor_asym32s") +def 
dequantize_per_tensor_asym32s_meta( + input: torch.Tensor, + scale: float, + zero_point: int, + quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> torch.Tensor: + return input.new_empty(input.size(), dtype=torch.float) + + @register_fake("cadence::quantized_add") def quantized_add_meta( X: torch.Tensor, @@ -717,8 +926,8 @@ def quantized_linear_asym8uxasym8u_asym8u_per_tensor_meta( return src.new_empty(out_size, dtype=src.dtype) -@register_fake("cadence::quantized_conv_nhwc") -def quantized_conv_nhwc_meta( +@register_fake("cadence::quantized_conv2d_nhwc") +def quantized_conv2d_nhwc_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -761,8 +970,8 @@ def quantized_conv_nhwc_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv_nchw") -def quantized_conv_nchw_meta( +@register_fake("cadence::quantized_conv2d_nchw") +def quantized_conv2d_nchw_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -805,8 +1014,8 @@ def quantized_conv_nchw_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv_nchw.per_tensor") -def quantized_conv_nchw_per_tensor_meta( +@register_fake("cadence::quantized_conv2d_nchw.per_tensor") +def quantized_conv2d_nchw_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -849,8 +1058,8 @@ def quantized_conv_nchw_per_tensor_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv_nhwc.per_tensor") -def quantized_conv_nhwc_per_tensor_meta( +@register_fake("cadence::quantized_conv2d_nhwc.per_tensor") +def quantized_conv2d_nhwc_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -893,8 +1102,8 @@ def quantized_conv_nhwc_per_tensor_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv_nchw_asym8sxsym8s_asym8s.per_tensor") -def 
quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_meta( +@register_fake("cadence::quantized_conv2d_nchw_asym8sxsym8s_asym8s.per_tensor") +def quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -942,8 +1151,8 @@ def quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv_nchw_asym8uxsym8u_asym8u.per_tensor") -def quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_meta( +@register_fake("cadence::quantized_conv2d_nchw_asym8uxsym8u_asym8u.per_tensor") +def quantized_conv2d_nchw_asym8uxsym8u_asym8u_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -991,8 +1200,8 @@ def quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv_nhwc_asym8sxsym8s_asym8s.per_tensor") -def quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_meta( +@register_fake("cadence::quantized_conv2d_nhwc_asym8sxsym8s_asym8s.per_tensor") +def quantized_conv2d_nhwc_asym8sxsym8s_asym8s_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -1040,8 +1249,8 @@ def quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv_nhwc_asym8uxsym8u_asym8u.per_tensor") -def quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_meta( +@register_fake("cadence::quantized_conv2d_nhwc_asym8uxsym8u_asym8u.per_tensor") +def quantized_conv2d_nhwc_asym8uxsym8u_asym8u_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -1089,8 +1298,8 @@ def quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv_nchw_dilated_asym8sxsym8s_asym8s.per_tensor") -def 
quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_meta( +@register_fake("cadence::quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s.per_tensor") +def quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -1138,8 +1347,8 @@ def quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv_nchw_dilated_asym8uxsym8u_asym8u.per_tensor") -def quantized_conv_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_meta( +@register_fake("cadence::quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u.per_tensor") +def quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -1187,8 +1396,8 @@ def quantized_conv_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor") -def quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_meta( +@register_fake("cadence::quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor") +def quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -1236,8 +1445,8 @@ def quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor") -def quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_meta( +@register_fake("cadence::quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor") +def quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -1285,8 +1494,10 @@ def quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_meta( return input.new_empty(output_size, 
dtype=input.dtype) -@register_fake("cadence::quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor") -def quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_meta( +@register_fake( + "cadence::quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor" +) +def quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -1334,8 +1545,10 @@ def quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor") -def quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_meta( +@register_fake( + "cadence::quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor" +) +def quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -1383,8 +1596,10 @@ def quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor") -def quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_meta( +@register_fake( + "cadence::quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor" +) +def quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -1432,8 +1647,10 @@ def quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor") -def quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_meta( +@register_fake( + "cadence::quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor" +) +def quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_meta( input: torch.Tensor, 
weight: torch.Tensor, bias: torch.Tensor, @@ -2050,10 +2267,10 @@ def avg_pool2d_meta( kernel_size: Tuple[int], stride: Tuple[int], padding: Tuple[int], - ceil_mode: bool, - count_include_pad: Optional[bool] = True, + ceil_mode: bool = False, + count_include_pad: bool = True, divisor_override: Optional[int] = None, - in_zero_point: Optional[int] = None, + in_zero_point: Optional[torch.Tensor] = None, channel_last: bool = False, ) -> torch.Tensor: # Use torch native meta kernels when operator semantics are similar @@ -2108,6 +2325,28 @@ def transposed_im2row_meta( return input.new_empty(output_size, dtype=input.dtype) +@register_fake("cadence::quantized_embedding_byte") +def quantized_embedding_byte_meta( + weight: torch.Tensor, + weight_scales: torch.Tensor, + weight_zero_points: torch.Tensor | None, + indices: torch.Tensor, + pruned_weights: bool = False, +) -> torch.Tensor: + assert not pruned_weights + assert len(weight.shape) == 2 + assert 1 <= len(weight_scales.shape) <= 2 + if len(weight_scales.shape) == 2: + num_groups = weight_scales.shape[-1] + assert weight.shape[1] % num_groups == 0 + + if weight_zero_points is not None: + assert weight_zero_points.shape == weight_scales.shape + + assert 1 <= len(indices.shape) <= 2 + return torch.empty(*indices.shape, weight.shape[1], dtype=torch.float32) + + @register_fake("cadence::where_Scalar") def where_Scalar_meta( condition: torch.Tensor, @@ -2190,8 +2429,8 @@ def roi_align_box_processor_meta( return rois.new_empty((rois.shape[0], 80), dtype=torch.uint8) -@register_fake("cadence::quantized_conv1d_nchw_asym8sxsym8s_asym8s.per_tensor") -def quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_meta( +@register_fake("cadence::quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor") +def quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -2226,8 +2465,8 @@ def quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_meta( return 
input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv1d_nchw_asym8uxsym8u_asym8u.per_tensor") -def quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_meta( +@register_fake("cadence::quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor") +def quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -2262,8 +2501,8 @@ def quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv1d_nhwc_asym8sxsym8s_asym8s.per_tensor") -def quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_meta( +@register_fake("cadence::quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor") +def quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -2298,8 +2537,8 @@ def quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_meta( return input.new_empty(output_size, dtype=input.dtype) -@register_fake("cadence::quantized_conv1d_nhwc_asym8uxsym8u_asym8u.per_tensor") -def quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_meta( +@register_fake("cadence::quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor") +def quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_meta( input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -2368,3 +2607,67 @@ def quantized_softmax_per_tensor_meta( out_zero_point: int, ) -> torch.Tensor: return input.new_empty(input.size(), dtype=input.dtype) + + +@register_fake("cadence::quantized_w8a32_linear") +def quantized_w8a32_linear_meta( + src: torch.Tensor, + weight: torch.Tensor, + w_scale: float, + bias: torch.Tensor, + b_scale: float, +) -> torch.Tensor: + # src comes in shape [leading_dims, in_dim] + # weight comes in shape [in_dim, out_dim] + # output comes in empty with shape [leading_dims, out_dim] + src_shape = list(src.shape) + weight_shape = weight.shape + assert len(weight_shape) == 2 + 
assert src_shape[-1] == weight_shape[-1] + src_shape[-1] = weight_shape[0] + return src.new_empty(src_shape, dtype=src.dtype) + + +@register_fake("cadence::quantized_w8a32_conv") +def quantized_w8a32_conv_meta( + src: torch.Tensor, + weight: torch.Tensor, + w_scale: float, + bias: torch.Tensor, + b_scale: float, +) -> torch.Tensor: + # src comes in shape [batch, in_channel, in_length] + # weight comes in shape [out_ch, in_ch, kernel_dim] + # output comes in empty with shape [batch, out_ch, in_length - kernel_dim + 1] + assert len(src.shape) == 3 + + kernel_size, out_channels, in_channels = weight.shape + assert in_channels == src.shape[-1] + + # Compute the output tensor size + output_size = get_conv1d_output_size( + src.permute(0, 2, 1).shape, + out_channels, + stride=1, + padding=0, + dilation=1, + kernel_size=kernel_size, + channel_last=False, + ) + return src.new_empty(output_size, dtype=src.dtype) + + +@register_fake("cadence::quantized_w8a32_gru") +def quantized_w8a32_gru_meta( + inputs: torch.Tensor, + hidden: torch.Tensor, + weights_inputs: torch.Tensor, + w_i_scale: float, + weights_hidden: torch.Tensor, + w_h_scale: float, + bias_inputs: torch.Tensor, + b_i_scale: float, + bias_hidden: torch.Tensor, + b_h_scale: float, +) -> torch.Tensor: + return inputs.new_empty((2, hidden.shape[-1]), dtype=inputs.dtype) diff --git a/backends/cadence/aot/program_builder.py b/backends/cadence/aot/program_builder.py index 862ba4e977c..46d730b68ff 100644 --- a/backends/cadence/aot/program_builder.py +++ b/backends/cadence/aot/program_builder.py @@ -12,6 +12,7 @@ from torch import Tensor from torch._export.verifier import Verifier from torch._ops import OpOverload +from torch._subclasses.fake_tensor import FakeTensorMode from torch.export import ExportedProgram from torch.export.exported_program import ModuleCallEntry, ModuleCallSignature from torch.export.graph_signature import ( @@ -37,6 +38,7 @@ def __init__( self, mode: Optional[IrMode] = None, 
_core_aten_ops_exception_list: Optional[list[OpOverload]] = None, + fake_tensor_mode: Optional[FakeTensorMode] = None, ) -> None: self.input_specs: list[InputSpec] = [] self.output_specs: list[OutputSpec] = [] @@ -46,7 +48,7 @@ def __init__( self._core_aten_ops_exception_list: list[OpOverload] = ( _core_aten_ops_exception_list or [] ) - super().__init__() + super().__init__(fake_tensor_mode=fake_tensor_mode) def insert_input_spec( self, target: str, input_kind: InputKind, value: Tensor diff --git a/backends/cadence/aot/quantizer/fusion_pass.py b/backends/cadence/aot/quantizer/fusion_pass.py index ed14574a8c8..2fa0f794e3c 100644 --- a/backends/cadence/aot/quantizer/fusion_pass.py +++ b/backends/cadence/aot/quantizer/fusion_pass.py @@ -24,6 +24,9 @@ LayerNormPattern, LinearPattern, MatmulPattern, + MixedW8A32ConvPattern, + MixedW8A32GruPattern, + MixedW8A32LinearPattern, ReluPattern0, ReluPattern1, SoftmaxPattern, @@ -390,6 +393,29 @@ def get_args_and_kwargs_relu( return args, kwargs +def get_args_and_kwargs_mixed_w8a32_linear( + graph_module: GraphModule, + other_inputs: List[fx.Node], + weights_inputs: List[fx.Node], + dequants_weights: List[fx.Node], + bias_inputs: List[fx.Node], + dequants_biases: List[fx.Node], +) -> Tuple[Tuple[ArgsType, ...], Dict[str, ArgsType]]: + w_scale_ = dequants_weights[0].args[1] + b_scale_ = dequants_biases[0].args[1] + + args = ( + other_inputs[0], + weights_inputs[0], + w_scale_, + bias_inputs[0], + b_scale_, + ) + kwargs = {} + + return args, kwargs + + def get_args_and_kwargs_softmax( graph_module: GraphModule, inputs_inputs: List[fx.Node], @@ -454,6 +480,87 @@ def get_args_and_kwargs_softmax( out_zero_point_tensor, ) kwargs = {} + + return args, kwargs + + +def get_args_and_kwargs_mixed_w8a32_conv( + graph_module: GraphModule, + other_inputs: List[fx.Node], + weights_inputs: List[fx.Node], + dequants_weights: List[fx.Node], + bias_inputs: List[fx.Node], + dequants_biases: List[fx.Node], + op_node: fx.Node, +) -> 
Tuple[Tuple[ArgsType, ...], Dict[str, ArgsType]]: + # Stride, padding, dilation, groups not supported yet + if len(op_node.args) > 3: + assert op_node.args[3] == [1] # Stride + if len(op_node.args) > 4: + assert op_node.args[4] == [0] # Padding + if len(op_node.args) > 5: + assert op_node.args[5] == [1] # Dilation + if len(op_node.args) > 6: + assert op_node.args[6] == 1 # Groups + + assert len(dequants_weights) == 1 + assert len(dequants_biases) == 1 + W_scale_ = dequants_weights[0].args[1] + B_scale_ = dequants_biases[0].args[1] + + transposed_inputs = graph_module.graph.call_function( + torch.ops.aten.permute.default, + (other_inputs[0], [0, 2, 1]), # NCL -> NLC + ) + transposed_weights = graph_module.graph.call_function( + torch.ops.aten.permute.default, + (weights_inputs[0], [2, 0, 1]), # NCL -> NLC + ) + + args = ( + transposed_inputs, + transposed_weights, + W_scale_, + bias_inputs[0], + B_scale_, + ) + kwargs = {} + + return args, kwargs + + +def get_args_and_kwargs_mixed_w8a32_gru( + graph_module: GraphModule, + other_inputs: List[fx.Node], + weights_inputs: List[fx.Node], + dequants_weights: List[fx.Node], + bias_inputs: List[fx.Node], + dequants_biases: List[fx.Node], + op_node: fx.Node, +) -> Tuple[Tuple[ArgsType, ...], Dict[str, ArgsType]]: + # Stride, padding, dilation, groups not supported yet + + assert len(dequants_weights) == 2 + assert len(dequants_biases) == 2 + w_i_scale = dequants_weights[0].args[1] + w_h_scale = dequants_weights[1].args[1] + b_i_scale = dequants_biases[0].args[1] + b_h_scale = dequants_biases[1].args[1] + + args = ( + other_inputs[0], + other_inputs[1], + weights_inputs[0], + w_i_scale, + weights_inputs[1], + w_h_scale, + bias_inputs[0], + b_i_scale, + bias_inputs[1], + b_h_scale, + ) + kwargs = {} + return args, kwargs @@ -471,7 +578,7 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901 pattern.partition_types(), ) for fused_partition in fused_partitions: - anchors = pattern.get_anchors(graph_module, 
fused_partition) + anchors, op_node = pattern.get_anchors(graph_module, fused_partition) if not anchors or anchors.empty: continue if any(self.is_fused(p.nodes) for p in fused_partition): @@ -512,13 +619,10 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901 bias_inputs = [node.args[0] for node in dequants_biases] other_inputs = [node.args[idx] for node, idx in anchors.others] - # The node is the first index of the list and first of the tuple - anchor_output_node = anchors.output[0][0] - - assert len(anchor_output_node.users) == 1 - quant_node = list(anchor_output_node.users.keys())[0] + assert op_node is not None, "op_node is None" + quant_node = list(op_node.users.keys())[0] - with graph_module.graph.inserting_after(anchor_output_node): + with graph_module.graph.inserting_after(op_node): args = tuple( inputs_inputs + weights_inputs + other_inputs + bias_inputs ) @@ -532,7 +636,7 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901 ) elif isinstance(pattern, CatPattern): args, kwargs = get_args_and_kwargs_cat( - inputs_inputs, other_inputs, anchor_output_node + inputs_inputs, other_inputs, op_node ) elif isinstance(pattern, ConvReluPatterns): # For ConvReLU, we are fusing Conv+ReLU @@ -563,7 +667,7 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901 dequants_weights, bias_inputs, quant_node, - anchor_output_node, + op_node, ) elif isinstance(pattern, LinearPattern): args, kwargs = get_args_and_kwargs_linear( @@ -618,20 +722,57 @@ def call(self, graph_module: fx.GraphModule) -> PassResult: # noqa: C901 inputs_inputs, dequants_inputs, quant_node, - anchor_output_node, + op_node, ) + elif isinstance(pattern, MixedW8A32LinearPattern): + args, kwargs = get_args_and_kwargs_mixed_w8a32_linear( + graph_module, + other_inputs, + weights_inputs, + dequants_weights, + bias_inputs, + dequants_biases, + ) + elif isinstance(pattern, MixedW8A32ConvPattern): + args, kwargs = 
get_args_and_kwargs_mixed_w8a32_conv( + graph_module, + other_inputs, + weights_inputs, + dequants_weights, + bias_inputs, + dequants_biases, + op_node, + ) + elif isinstance(pattern, MixedW8A32GruPattern): + args, kwargs = get_args_and_kwargs_mixed_w8a32_gru( + graph_module, + other_inputs, + weights_inputs, + dequants_weights, + bias_inputs, + dequants_biases, + op_node, + ) + fused = graph_module.graph.call_function( pattern.replacement_op(), args, kwargs, ) - fused.meta = quant_node.meta - quant_node.replace_all_uses_with(fused) + + if len(anchors.output) > 0: + fused.meta = quant_node.meta + quant_node.replace_all_uses_with(fused) + else: + fused.meta = op_node.meta + op_node.replace_all_uses_with(fused) + if op_node.op == "output": + _ = graph_module.graph.output((fused,)) legalize_graph(graph_module) graph_module.graph.eliminate_dead_code() - # pyre-fixme[7]: Incompatible return type graph_module.recompile() + return PassResult(graph_module, True) @classmethod # pyre-ignore[2]: Parameter `nodes` has no type specified diff --git a/backends/cadence/aot/quantizer/patterns.py b/backends/cadence/aot/quantizer/patterns.py index 33b476f5120..2452cfdcfea 100644 --- a/backends/cadence/aot/quantizer/patterns.py +++ b/backends/cadence/aot/quantizer/patterns.py @@ -8,7 +8,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass, field -from typing import List, Optional, Tuple, Union +from typing import List, Tuple, Union import torch from executorch.backends.cadence.aot.quantizer.utils import get_bias_qparams @@ -67,7 +67,7 @@ def partition_types(self) -> list[OpOverload]: @abstractmethod def get_anchors( self, gm: torch.fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> Optional[PartitionAnchors]: + ) -> Tuple[PartitionAnchors, fx.Node]: pass @abstractmethod @@ -85,7 +85,7 @@ def partition_types(self) -> List[OpOverload]: def get_anchors( self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> PartitionAnchors: + ) -> 
Tuple[PartitionAnchors, fx.Node]: # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... addmm_node = fused_partition[0].nodes[-1] @@ -101,11 +101,14 @@ def get_anchors( qscheme=torch.per_tensor_affine, ) - return PartitionAnchors( - inputs=[(addmm_node, 1)], - weights=[(addmm_node, 2)], - biases=[(addmm_node, 0, bias_qspec)], - output=[(addmm_node,)], + return ( + PartitionAnchors( + inputs=[(addmm_node, 1)], + weights=[(addmm_node, 2)], + biases=[(addmm_node, 0, bias_qspec)], + output=[(addmm_node,)], + ), + addmm_node, ) def replacement_op(self) -> OpOverload: @@ -118,7 +121,7 @@ def partition_types(self) -> List[OpOverload]: def get_anchors( self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> PartitionAnchors: + ) -> Tuple[PartitionAnchors, fx.Node]: # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... add_node = fused_partition[0].nodes[-1] @@ -129,15 +132,21 @@ def get_anchors( add_node.args[1], fx.Node ) if not is_tensor_add or len(add_node.kwargs) > 0: - return PartitionAnchors( - empty=True, + return ( + PartitionAnchors( + empty=True, + ), + add_node, ) - return PartitionAnchors( - inputs=[(add_node, 0), (add_node, 1)], - weights=[], - biases=[], - output=[(add_node,)], + return ( + PartitionAnchors( + inputs=[(add_node, 0), (add_node, 1)], + weights=[], + biases=[], + output=[(add_node,)], + ), + add_node, ) def replacement_op(self) -> OpOverload: @@ -150,15 +159,18 @@ def partition_types(self) -> List[OpOverload]: def get_anchors( self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> PartitionAnchors: + ) -> Tuple[PartitionAnchors, fx.Node]: # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... 
bmm_node = fused_partition[0].nodes[-1] - return PartitionAnchors( - inputs=[(bmm_node, 0), (bmm_node, 1)], - weights=[], - biases=[], - output=[(bmm_node,)], + return ( + PartitionAnchors( + inputs=[(bmm_node, 0), (bmm_node, 1)], + weights=[], + biases=[], + output=[(bmm_node,)], + ), + bmm_node, ) def replacement_op(self) -> OpOverload: @@ -171,7 +183,7 @@ def partition_types(self) -> List[OpOverload]: def get_anchors( self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> PartitionAnchors: + ) -> Tuple[PartitionAnchors, fx.Node]: # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... cat_node = fused_partition[0].nodes[-1] @@ -198,13 +210,16 @@ def get_anchors( ) ) - return PartitionAnchors( - inputs=args, - weights=[], - biases=[], - output=[ - (cat_node, SharedQuantizationSpec((cat_node.args[0][0], cat_node))) - ], + return ( + PartitionAnchors( + inputs=args, + weights=[], + biases=[], + output=[ + (cat_node, SharedQuantizationSpec((cat_node.args[0][0], cat_node))) + ], + ), + cat_node, ) def replacement_op(self) -> OpOverload: @@ -217,7 +232,7 @@ def partition_types(self) -> List[OpOverload]: def get_anchors( self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> PartitionAnchors: + ) -> Tuple[PartitionAnchors, fx.Node]: # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... 
conv1d_node = fused_partition[0].nodes[-1] @@ -238,16 +253,19 @@ def get_anchors( if len(conv1d_node.args) > 2 and conv1d_node.args[2] is not None: bias = [(conv1d_node, 2, bias_qspec)] - return PartitionAnchors( - inputs=[(conv1d_node, 0)], - weights=[(conv1d_node, 1)], - # pyre-fixme[6]: Incompatible parameter type - biases=bias, - output=[(conv1d_node,)], + return ( + PartitionAnchors( + inputs=[(conv1d_node, 0)], + weights=[(conv1d_node, 1)], + # pyre-fixme[6]: Incompatible parameter type + biases=bias, + output=[(conv1d_node,)], + ), + conv1d_node, ) def replacement_op(self) -> OpOverload: - return torch.ops.cadence.quantized_conv_nchw.default + return torch.ops.cadence.quantized_conv2d_nchw.default class Conv2dPattern(QuantizationPattern): @@ -256,7 +274,7 @@ def partition_types(self) -> List[OpOverload]: def get_anchors( self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> PartitionAnchors: + ) -> Tuple[PartitionAnchors, fx.Node]: # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... 
conv2d_node = fused_partition[0].nodes[-1] @@ -277,16 +295,19 @@ def get_anchors( if len(conv2d_node.args) > 2 and conv2d_node.args[2] is not None: bias = [(conv2d_node, 2, bias_qspec)] - return PartitionAnchors( - inputs=[(conv2d_node, 0)], - weights=[(conv2d_node, 1)], - # pyre-fixme[6]: Incompatible parameter type - biases=bias, - output=[(conv2d_node,)], + return ( + PartitionAnchors( + inputs=[(conv2d_node, 0)], + weights=[(conv2d_node, 1)], + # pyre-fixme[6]: Incompatible parameter type + biases=bias, + output=[(conv2d_node,)], + ), + conv2d_node, ) def replacement_op(self) -> OpOverload: - return torch.ops.cadence.quantized_conv_nchw.default + return torch.ops.cadence.quantized_conv2d_nchw.default class LayerNormPattern(QuantizationPattern): @@ -295,7 +316,7 @@ def partition_types(self) -> List[OpOverload]: def get_anchors( self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> PartitionAnchors: + ) -> Tuple[PartitionAnchors, fx.Node]: # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... layer_norm_node = fused_partition[0].nodes[-1] @@ -311,13 +332,16 @@ def get_anchors( # Weights are used in quantized mode by our kernel, so they are # passed in as others here along with the normalized shape. - return PartitionAnchors( - inputs=[(layer_norm_node, 0)], - weights=[], - biases=[], - # Ordering: normalized_shape, weights, bias - others=others, - output=[(layer_norm_node,)], + return ( + PartitionAnchors( + inputs=[(layer_norm_node, 0)], + weights=[], + biases=[], + # Ordering: normalized_shape, weights, bias + others=others, + output=[(layer_norm_node,)], + ), + layer_norm_node, ) def replacement_op(self) -> OpOverload: @@ -330,7 +354,7 @@ def partition_types(self) -> List[OpOverload]: def get_anchors( self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> PartitionAnchors: + ) -> Tuple[PartitionAnchors, fx.Node]: # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... 
linear_node = fused_partition[0].nodes[-1] @@ -351,12 +375,15 @@ def get_anchors( if len(linear_node.args) > 2: bias = [(linear_node, 2, bias_qspec)] - return PartitionAnchors( - inputs=[(linear_node, 0)], - weights=[(linear_node, 1)], - # pyre-fixme[6]: Incompatible parameter type - biases=bias, - output=[(linear_node,)], + return ( + PartitionAnchors( + inputs=[(linear_node, 0)], + weights=[(linear_node, 1)], + # pyre-fixme[6]: Incompatible parameter type + biases=bias, + output=[(linear_node,)], + ), + linear_node, ) def replacement_op(self) -> OpOverload: @@ -369,15 +396,18 @@ def partition_types(self) -> List[OpOverload]: def get_anchors( self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> PartitionAnchors: + ) -> Tuple[PartitionAnchors, fx.Node]: # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... matmul_node = fused_partition[0].nodes[-1] - return PartitionAnchors( - inputs=[(matmul_node, 0), (matmul_node, 1)], - weights=[], - biases=[], - output=[(matmul_node,)], + return ( + PartitionAnchors( + inputs=[(matmul_node, 0), (matmul_node, 1)], + weights=[], + biases=[], + output=[(matmul_node,)], + ), + matmul_node, ) def replacement_op(self) -> OpOverload: @@ -392,15 +422,18 @@ def partition_types(self) -> List[OpOverload]: def get_anchors( self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> PartitionAnchors: + ) -> Tuple[PartitionAnchors, fx.Node]: # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... 
relu_node = fused_partition[0].nodes[-1] - return PartitionAnchors( - inputs=[(relu_node, 0)], - weights=[], - biases=[], - output=[(relu_node,)], + return ( + PartitionAnchors( + inputs=[(relu_node, 0)], + weights=[], + biases=[], + output=[(relu_node,)], + ), + relu_node, ) def replacement_op(self) -> OpOverload: @@ -427,7 +460,7 @@ def partition_types(self) -> List[OpOverload]: def get_anchors( self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> PartitionAnchors: + ) -> Tuple[PartitionAnchors, fx.Node]: # The first node should be conv, the second should be relu # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... conv_node = fused_partition[0].nodes[-1] # Second to last node @@ -451,16 +484,19 @@ def get_anchors( if len(conv_node.args) > 2 and conv_node.args[2] is not None: bias = [(conv_node, 2, bias_qspec)] - return PartitionAnchors( - inputs=[(conv_node, 0)], - weights=[(conv_node, 1)], - # pyre-fixme[6]: Incompatible parameter type - biases=bias, - output=[(relu_node,)], # Output is from the relu node + return ( + PartitionAnchors( + inputs=[(conv_node, 0)], + weights=[(conv_node, 1)], + # pyre-fixme[6]: Incompatible parameter type + biases=bias, + output=[(relu_node,)], # Output is from the relu node + ), + relu_node, ) def replacement_op(self) -> OpOverload: - return torch.ops.cadence.quantized_conv_nchw.default + return torch.ops.cadence.quantized_conv2d_nchw.default # Conv1d + regular relu op fusion @@ -488,22 +524,197 @@ def partition_types(self) -> List[OpOverload]: class SoftmaxPattern(QuantizationPattern): - def partition_types(self) -> List[OpOverload]: return [torch.ops.aten._softmax.default] def get_anchors( self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> PartitionAnchors: + ) -> Tuple[PartitionAnchors, fx.Node]: # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... 
softmax_node = fused_partition[0].nodes[-1] - return PartitionAnchors( - inputs=[(softmax_node, 0)], - weights=[], - biases=[], - output=[(softmax_node,)], + return ( + PartitionAnchors( + inputs=[(softmax_node, 0)], + weights=[], + biases=[], + output=[(softmax_node,)], + ), + softmax_node, ) def replacement_op(self) -> OpOverload: return torch.ops.cadence.quantized_softmax.default + + +class MixedW8A32LinearPattern(QuantizationPattern): + def partition_types(self) -> List[OpOverload]: + return [torch.ops.aten.linear.default] + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + ) -> Tuple[PartitionAnchors, fx.Node]: + # pyre-ignore[29] + linear_layer = fused_partition[0].nodes[-1] + + # Bail if the arguments have different shapes than expected + if len(linear_layer.args) != 3 or len(linear_layer.kwargs) > 0: + return ( + PartitionAnchors( + empty=True, + ), + linear_layer, + ) + + input_node = linear_layer.args[0] + input_shape = input_node.meta["tensor_meta"].shape + + # Bail if the weights are not multiple of 4 (SIMD) + if input_shape[-1] % 4 != 0: + return ( + PartitionAnchors( + empty=True, + ), + linear_layer, + ) + # Currenly only supporting vector-matrix multiplication + if len(input_shape) > 0 and input_shape[-2] != 1: + return ( + PartitionAnchors( + empty=True, + ), + linear_layer, + ) + + return ( + PartitionAnchors( + inputs=[], + weights=[(linear_layer, 1)], + biases=[(linear_layer, 2)], + output=[], + others=[(linear_layer, 0)], + ), + linear_layer, + ) + + def replacement_op(self) -> OpOverload: + return torch.ops.cadence.quantized_w8a32_linear.default + + +class MixedW8A32ConvPattern(QuantizationPattern): + def partition_types(self) -> List[OpOverload]: + return [torch.ops.aten.conv1d.default] + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + ) -> Tuple[PartitionAnchors, fx.Node]: + # pyre-ignore[29] + conv_layer = fused_partition[0].nodes[-1] + + # Bail if the arguments 
have different shapes than expected + # Stride, padding, dilation and groups are not supported + if len(conv_layer.args) != 3 or len(conv_layer.kwargs) > 0: + return ( + PartitionAnchors( + empty=True, + ), + conv_layer, + ) + + cnn_weights = conv_layer.args[1] + if hasattr(cnn_weights.meta, "tensor_meta"): + cnn_weights_shape = cnn_weights.meta["tensor_meta"].shape + # Bail if the channels are not multiple of 4 (SIMD) + if cnn_weights_shape[0] % 4 != 0: + return ( + PartitionAnchors( + empty=True, + ), + conv_layer, + ) + if cnn_weights_shape[1] % 4 != 0: + return ( + PartitionAnchors( + empty=True, + ), + conv_layer, + ) + # Bail if the kernel size is not 3 + if cnn_weights_shape[2] != 3: + return ( + PartitionAnchors( + empty=True, + ), + conv_layer, + ) + + return ( + PartitionAnchors( + inputs=[], + weights=[(conv_layer, 1)], + biases=[(conv_layer, 2)], + output=[], + others=[(conv_layer, 0)], + ), + conv_layer, + ) + + def replacement_op(self) -> OpOverload: + return torch.ops.cadence.quantized_w8a32_conv.default + + +class MixedW8A32GruPattern(QuantizationPattern): + def partition_types(self) -> List[OpOverload]: + return [torch.ops.aten.gru.input] + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + ) -> Tuple[PartitionAnchors, fx.Node]: + # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... 
+ gru_layer = fused_partition[0].nodes[-1] + if len(gru_layer.kwargs) > 0: + return ( + PartitionAnchors( + empty=True, + ), + gru_layer, + ) + + # Bail if input or states are not multiple of 4 (SIMD) + if gru_layer.args[0].meta["tensor_meta"].shape[-1] % 4 != 0: + return ( + PartitionAnchors( + empty=True, + ), + gru_layer, + ) + if gru_layer.args[1].meta["tensor_meta"].shape[-1] % 4 != 0: + return ( + PartitionAnchors( + empty=True, + ), + gru_layer, + ) + + class Wrapper: # noqa: B903 + def __init__(self, args, meta): + self.args = args + self.meta = meta + + wrapper = Wrapper(tuple(gru_layer.args[2]), gru_layer.meta) + + return ( + PartitionAnchors( + inputs=[], + # pyre-fixme[6]: Expected `List[Tuple[Node, int]]` but got `List[Tuple[Wrapper, int]]`. + weights=[(wrapper, 0), (wrapper, 1)], + # pyre-fixme[6]: Expected `List[Union[Tuple[Node, int], Tuple[Node, int, DerivedQuantizationSpec]]]` but got `List[Tuple[Wrapper, int]]`. + biases=[(wrapper, 2), (wrapper, 3)], + output=[], + others=[(gru_layer, 0), (gru_layer, 1)], + ), + gru_layer, + ) + + def replacement_op(self) -> OpOverload: + return torch.ops.cadence.quantized_w8a32_gru.default diff --git a/backends/cadence/aot/quantizer/quantizer.py b/backends/cadence/aot/quantizer/quantizer.py index ad5f935173e..d4af074c475 100644 --- a/backends/cadence/aot/quantizer/quantizer.py +++ b/backends/cadence/aot/quantizer/quantizer.py @@ -24,6 +24,9 @@ LayerNormPattern, LinearPattern, MatmulPattern, + MixedW8A32ConvPattern, + MixedW8A32GruPattern, + MixedW8A32LinearPattern, QuantizationPattern, ReluPattern0, ReluPattern1, @@ -109,6 +112,13 @@ None, ) +qconfig_A32W8sym = QuantizationConfig( + input_activation=None, + output_activation=None, + weight=wgt_qspec_sym8s, + bias=wgt_qspec_sym8s, +) + class CadenceAtenQuantizer(Quantizer): def __init__( @@ -133,7 +143,7 @@ def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: if not no_outside_users(fused_partition): continue - anchors = 
self.pattern.get_anchors(model, fused_partition) + anchors, _ = self.pattern.get_anchors(model, fused_partition) if not anchors or anchors.empty: continue if is_annotated( @@ -302,6 +312,26 @@ def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None: super().__init__(quantizers) +class CadenceW8A32MixedQuantizer(CadenceQuantizer): + """ + Quantizer for mixed quantization, 8 bit weights and 32 bit activations + TODO: Experimental quantizer, not yet well supported in OSS + """ + + def __init__(self) -> None: + quantizers = [] + quantizers.append( + CadenceAtenQuantizer(MixedW8A32LinearPattern(), qconfig_A32W8sym) + ) + quantizers.append( + CadenceAtenQuantizer(MixedW8A32ConvPattern(), qconfig_A32W8sym) + ) + quantizers.append( + CadenceAtenQuantizer(MixedW8A32GruPattern(), qconfig_A32W8sym) + ) + super().__init__(quantizers) + + class CadenceWithSoftmaxQuantizer(CadenceQuantizer): """ Quantizer including A16 softmax diff --git a/backends/cadence/aot/ref_implementations.py b/backends/cadence/aot/ref_implementations.py index 2a53c2dde7a..ed9bb438a9e 100644 --- a/backends/cadence/aot/ref_implementations.py +++ b/backends/cadence/aot/ref_implementations.py @@ -6,16 +6,17 @@ # pyre-strict - from typing import Callable import torch +import torch.nn as nn +import torch.nn.functional as F from executorch.exir.scalar_type import ScalarType from torch.library import impl, Library - m = Library("cadence", "IMPL", "CompositeExplicitAutograd") +torch.ops.load_library("//executorch/kernels/quantized:custom_ops_generated_lib") qdtype_map: dict[ScalarType, torch.dtype] = { ScalarType.QINT8: torch.qint8, @@ -38,7 +39,7 @@ def quantize_per_tensor( Args: - input_tensor (Tensor): input tensor - - scale (float): Inverse of quantization scale. Derived from the ratio + - scale (float): Quantization scale. Derived from the ratio between the min/max of the floating-point tensor and the min/max of the quantized range, and then inverted. 
- zero_point (int): The point which represents 0 in the quantized @@ -61,13 +62,16 @@ def quantize_per_tensor( ] if dtype not in supported_quant_types: raise ValueError( - f"Unsupported dtype to quantize to. Supported dtypes must be one of {supported_quant_types}" + f"Unsupported dtype to quantize to {dtype}. Supported dtypes must be one of {supported_quant_types}" ) - quantized = torch.round(input_tensor * scale + zero_point).to(dtype) - return torch.max( - torch.min(quantized, torch.tensor(quant_max)), - torch.tensor(quant_min), + return torch.ops.quantized_decomposed.quantize_per_tensor( + input_tensor, + scale, + zero_point, + quant_min, + quant_max, + dtype, ) @@ -97,7 +101,7 @@ def dequantize_per_tensor( is already provided. - quant_max (int): The largest value in the quantized domain. Unused since scale is already provided. - - dtype (torch.dtype): The type of the output tensor. Must be a floating point type. + - dtype (torch.dtype): The type of the input tensor. """ supported_quant_types = [ torch.int8, @@ -108,23 +112,15 @@ def dequantize_per_tensor( ] if input_tensor.dtype not in supported_quant_types: raise ValueError(f"Input dtype must be one of {supported_quant_types}") - supported_dequant_types = [ - torch.float, - torch.float32, - torch.float16, - torch.bfloat16, - ] - if dtype not in supported_dequant_types: - raise ValueError( - f"Unsupported dtype to dequantize to. Supported dtypes must be one of {supported_dequant_types}" - ) - - # Needed to prevent underflow in cases where the zero_point is larger than - # the quantized value. 
- if not input_tensor.dtype.is_signed: - input_tensor = input_tensor.to(torch.int32) - - return (input_tensor - zero_point).to(dtype) * scale + if input_tensor.dtype != dtype: + raise ValueError("Input dtype must match dtype") + + # Use the reference implementation from torch quantized_decomposed library + # Unlike quantize_per_tensor, dequantize_per_tensor doesn't have a behavior + # difference, since there's no rounding algorithm (just arithmetic). + return torch.ops.quantized_decomposed.dequantize_per_tensor( + input_tensor, scale, zero_point, quant_min, quant_max, dtype + ) @impl(m, "quantized_add.per_tensor") @@ -180,12 +176,10 @@ def quantized_add_per_tensor( dequant_X = X_scale * (X - X_zero_point) dequant_Y = Y_scale * (Y - Y_zero_point) - out_scale_inv = 1 / out_scale - # q_min/q_max are unused args return quantize_per_tensor( dequant_X + dequant_Y, - out_scale_inv, + out_scale, out_zero_point, torch.iinfo(dtype).min, torch.iinfo(dtype).max, @@ -259,8 +253,7 @@ def quantized_linear_common( - out_zero_point (int): The quantized mapping of zero for the output - offset (Tensor): Unused """ - out_scale = -out_multiplier * (1 / (1 << 31)) * (2**out_shift) - out_scale_inv = 1 / out_scale + out_scale = 1.0 / (-out_multiplier * (1 / (1 << 31)) * (2**out_shift)) N, K = weight.shape @@ -271,7 +264,7 @@ def quantized_linear_common( supported_dtypes = [torch.int8, torch.uint8, torch.int32] if dtype not in supported_dtypes: raise ValueError( - f"Unsupported dtype to quantize to. Supported dtypes must be one of {supported_dtypes}" + f"Unsupported dtype to quantize to {dtype}. 
Supported dtypes must be one of {supported_dtypes}" ) out = torch.nn.functional.linear( @@ -281,7 +274,7 @@ def quantized_linear_common( ) return quantize_per_tensor( out, - out_scale_inv, + out_scale, out_zero_point, torch.iinfo(dtype).min, torch.iinfo(dtype).max, @@ -337,8 +330,8 @@ def variant( if out_shift.numel() != 1: raise ValueError("out_shift must be a scalar") - if out_shift.dtype != torch.int64: - raise ValueError("out_shift must be an int64") + if out_shift.dtype != torch.int32: + raise ValueError("out_shift must be an int32") _out_shift = int(out_shift.item()) _out_multiplier = int(out_multiplier[0].item()) @@ -399,6 +392,17 @@ def quantized_fully_connected_asym8sxasym8s_asym8s_per_tensor() -> torch.Tensor: def quantized_fully_connected_asym8uxasym8u_asym8u_per_tensor() -> torch.Tensor: ... +@impl(m, "fully_connected") +def fully_connected( + input_tensor: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, +) -> torch.Tensor: + if input_tensor.shape[0] != 1: + raise ValueError("Fully connected linear only supports batch size of 1") + return F.linear(input_tensor, weight, bias) + + @impl(m, "quantized_matmul") def quantized_matmul( X: torch.Tensor, @@ -423,25 +427,27 @@ def quantized_matmul( - out_multiplier (int): The multiplier used to scale the output - out_shift (int): The shift used to scale the output - out_zero_point (int): The quantized mapping of zero for the output - - transposed (bool): Whether to transpose the weight tensor + - transposed (bool): Whether Y is transposed. """ if bias is not None and not torch.all(bias == 0): raise ValueError("bias must be None or all zeros since unused in out variant") - # Looks weird, but quantized linear assumes weights are pre-transposed, - # hence we transpose only if `transposed` is False. 
- if not transposed: - Y = Y.T + if transposed: + Y = Y.transpose(-1, -2) - return quantized_linear_common( - X, - Y, - bias or torch.zeros(1, dtype=torch.int32), - X_zero_point, - Y_zero_point, - out_multiplier, - out_shift, + out_scale = 1.0 / (-out_multiplier * (1 / (1 << 31)) * (2**out_shift)) + + out = torch.matmul( + (X - X_zero_point).float(), + (Y - Y_zero_point).float(), + ) + return quantize_per_tensor( + out, + out_scale, out_zero_point, + torch.iinfo(X.dtype).min, + torch.iinfo(X.dtype).max, + X.dtype, ) @@ -538,7 +544,7 @@ def quantized_layer_norm_per_tensor( ) float_input_tensor = dequantize_per_tensor( - input_tensor, X_scale, X_zero_point, -128, 127, torch.float32 + input_tensor, X_scale, X_zero_point, -128, 127, input_tensor.dtype ) out = torch.nn.functional.layer_norm( float_input_tensor, normalized_shape, weight, bias, eps=eps @@ -546,7 +552,7 @@ def quantized_layer_norm_per_tensor( return quantize_per_tensor( out, - 1 / output_scale, + output_scale, output_zero_point, torch.iinfo(input_tensor.dtype).min, torch.iinfo(input_tensor.dtype).max, @@ -615,7 +621,7 @@ def quantized_conv_per_tensor( return quantize_per_tensor( float_out, - 1.0 / output_scale, + output_scale, output_zero_point, torch.iinfo(input_tensor.dtype).min, torch.iinfo(input_tensor.dtype).max, @@ -623,8 +629,8 @@ def quantized_conv_per_tensor( ) -@impl(m, "quantized_conv_nchw.per_tensor") -def quantized_conv_nchw_per_tensor( +@impl(m, "quantized_conv2d_nchw.per_tensor") +def quantized_conv2d_nchw_per_tensor( input_tensor: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -679,8 +685,8 @@ def quantized_conv_nchw_per_tensor( ) -@impl(m, "quantized_conv_nhwc.per_tensor") -def quantized_conv_nhwc_per_tensor( +@impl(m, "quantized_conv2d_nhwc.per_tensor") +def quantized_conv2d_nhwc_per_tensor( input_tensor: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, @@ -800,7 +806,7 @@ def variant( # Call the appropriate base function match layout: case "nchw": - return 
quantized_conv_nchw_per_tensor( + return quantized_conv2d_nchw_per_tensor( input_tensor, weight, bias, @@ -817,7 +823,7 @@ def variant( out_shift, ) case "nhwc": - return quantized_conv_nhwc_per_tensor( + return quantized_conv2d_nhwc_per_tensor( input_tensor, weight, bias, @@ -841,84 +847,248 @@ def variant( return decorator -@impl(m, "quantized_conv_nchw_asym8sxsym8s_asym8s.per_tensor") +@impl(m, "quantized_conv2d_nchw_asym8sxsym8s_asym8s.per_tensor") @quantized_conv_variant("nchw", torch.int8, torch.int8) -def quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... +def quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... -@impl(m, "quantized_conv_nchw_asym8uxsym8u_asym8u.per_tensor") +@impl(m, "quantized_conv2d_nchw_asym8uxsym8u_asym8u.per_tensor") @quantized_conv_variant("nchw", torch.uint8, torch.uint8) -def quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... +def quantized_conv2d_nchw_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... -@impl(m, "quantized_conv_nhwc_asym8sxsym8s_asym8s.per_tensor") +@impl(m, "quantized_conv2d_nhwc_asym8sxsym8s_asym8s.per_tensor") @quantized_conv_variant("nhwc", torch.int8, torch.int8) -def quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... +def quantized_conv2d_nhwc_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... -@impl(m, "quantized_conv_nhwc_asym8uxsym8u_asym8u.per_tensor") +@impl(m, "quantized_conv2d_nhwc_asym8uxsym8u_asym8u.per_tensor") @quantized_conv_variant("nhwc", torch.uint8, torch.uint8) -def quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... +def quantized_conv2d_nhwc_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... 
-@impl(m, "quantized_conv_nchw_dilated_asym8sxsym8s_asym8s.per_tensor") +@impl(m, "quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s.per_tensor") @quantized_conv_variant("nchw", torch.int8, torch.int8) -def quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... +def quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... -@impl(m, "quantized_conv_nchw_dilated_asym8uxsym8u_asym8u.per_tensor") +@impl(m, "quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u.per_tensor") @quantized_conv_variant("nchw", torch.uint8, torch.uint8) -def quantized_conv_nchw_dilated_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... +def quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... -@impl(m, "quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor") +@impl(m, "quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor") @quantized_conv_variant("nhwc", torch.int8, torch.int8) -def quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... +def quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... -@impl(m, "quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor") +@impl(m, "quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor") @quantized_conv_variant("nhwc", torch.uint8, torch.uint8) -def quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... +def quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... -@impl(m, "quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor") +@impl(m, "quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor") @quantized_conv_variant("nchw", torch.int8, torch.int8) -def quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... +def quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor() -> ( + torch.Tensor +): ... 
-@impl(m, "quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor") +@impl(m, "quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor") @quantized_conv_variant("nchw", torch.uint8, torch.uint8) -def quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... +def quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor() -> ( + torch.Tensor +): ... -@impl(m, "quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor") +@impl(m, "quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor") @quantized_conv_variant("nhwc", torch.int8, torch.int8) -def quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... +def quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor() -> ( + torch.Tensor +): ... -@impl(m, "quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor") +@impl(m, "quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor") @quantized_conv_variant("nhwc", torch.uint8, torch.uint8) -def quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... +def quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor() -> ( + torch.Tensor +): ... -@impl(m, "quantized_conv1d_nchw_asym8sxsym8s_asym8s.per_tensor") +@impl(m, "quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor") @quantized_conv_variant("nchw", torch.int8, torch.int8, is_1d=True) -def quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... +def quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... -@impl(m, "quantized_conv1d_nchw_asym8uxsym8u_asym8u.per_tensor") +@impl(m, "quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor") @quantized_conv_variant("nchw", torch.uint8, torch.uint8, is_1d=True) -def quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... +def quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... 
-@impl(m, "quantized_conv1d_nhwc_asym8sxsym8s_asym8s.per_tensor") +@impl(m, "quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor") @quantized_conv_variant("nhwc", torch.int8, torch.int8, is_1d=True) -def quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... +def quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... -@impl(m, "quantized_conv1d_nhwc_asym8uxsym8u_asym8u.per_tensor") +@impl(m, "quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor") @quantized_conv_variant("nhwc", torch.uint8, torch.uint8, is_1d=True) -def quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... +def quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... + + +@impl(m, "convolution") +def convolution( + input_tensor: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: tuple[int, int], + padding: tuple[int, int], + dilation: tuple[int, int], + groups: int, + channel_last: bool = False, +) -> torch.Tensor: + conv_is_1d = len(input_tensor.shape) == 3 + if channel_last: + if conv_is_1d: + input_tensor = input_tensor.movedim(-1, 1).contiguous() + if len(weight.shape) != 3: + raise ValueError("Weight tensor must be 3D if input is 3D") + weight = weight.movedim(-1, 1).contiguous() + else: + input_tensor = input_tensor.movedim(-1, -3) + if len(weight.shape) != 4: + raise ValueError("Weight tensor must be 4D if input is nd > 3") + weight = torch.permute(weight, (0, -1, 1, 2)).contiguous() + + _stride: tuple[int, int] | int = stride + _padding: tuple[int, int] | int = padding + _dilation: tuple[int, int] | int = dilation + + if conv_is_1d: + conv = torch.nn.functional.conv1d + _stride = stride[0] + _padding = padding[0] + _dilation = dilation[0] + else: + conv = torch.nn.functional.conv2d + + conv_out = conv(input_tensor, weight, bias, _stride, _padding, _dilation, groups) + if channel_last: + if conv_is_1d: + conv_out = conv_out.movedim(1, -1).contiguous() + else: + conv_out = conv_out.movedim(-3, 
-1).contiguous() + + return conv_out + + +@impl(m, "transposed_convolution") +def transposed_convolution( + input_tensor: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: tuple[int, int], + padding: tuple[int, int], + dilation: tuple[int, int], + output_padding: tuple[int, int], + groups: int, + channel_last: bool = False, +) -> torch.Tensor: + + conv_is_1d = len(input_tensor.shape) == 3 + if channel_last: + if conv_is_1d: + input_tensor = input_tensor.movedim(-1, 1).contiguous() + if len(weight.shape) != 3: + raise ValueError("Weight tensor must be 3D if input is 3D") + weight = weight.movedim(-1, 1).contiguous() + else: + input_tensor = input_tensor.movedim(-1, -3) + if len(weight.shape) != 4: + raise ValueError("Weight tensor must be 4D if input is nd > 3") + weight = torch.permute(weight, (0, -1, 1, 2)).contiguous() + + _stride: tuple[int, int] | int = stride + _padding: tuple[int, int] | int = padding + _dilation: tuple[int, int] | int = dilation + _output_padding: tuple[int, int] | int = output_padding + if conv_is_1d: + conv = torch.nn.functional.conv_transpose1d + _stride = stride[0] + _padding = padding[0] + _dilation = dilation[0] + _output_padding = output_padding[0] + else: + conv = torch.nn.functional.conv_transpose2d + + conv_out = conv( + input_tensor, + weight, + bias, + _stride, + _padding, + _output_padding, + groups, + _dilation, + ) + if channel_last: + if conv_is_1d: + conv_out = conv_out.movedim(1, -1).contiguous() + else: + conv_out = conv_out.movedim(-3, -1).contiguous() + + return conv_out + + +@impl(m, "avg_pool2d") +def avg_pool2d( + input_tensor: torch.Tensor, + kernel_size: tuple[int, int], + stride: tuple[int, int], + padding: tuple[int, int], + ceil_mode: bool = False, + count_include_pad: bool = False, + divisor_override: int | None = None, + in_zero_point: torch.Tensor | None = None, + channel_last: bool = False, +) -> torch.Tensor: + if channel_last: + raise NotImplementedError("Channel last is not yet 
supported for avg_pool2d") + + in_dtype = input_tensor.dtype + pad_h, pad_w = padding + if in_zero_point is not None: + # Avg pool2d does not allow non-0 padding, + # so we manually pad the input + pad_value = in_zero_point.item() + if not count_include_pad: + # To simulate this, just pad with 0s + pad_value = 0 + + input_tensor = torch.nn.functional.pad( + input_tensor, + (pad_w, pad_w, pad_h, pad_h), + mode="constant", + value=pad_value, + ).float() + + padding = (0, 0) + + out = torch.nn.functional.avg_pool2d( + input_tensor, + kernel_size, + stride, + padding, + ceil_mode, + count_include_pad, + divisor_override, + ) + + if in_zero_point is not None: + min_val = torch.iinfo(in_dtype).min + max_val = torch.iinfo(in_dtype).max + out = torch.clamp(torch.round(out), min_val, max_val) + + return out.to(in_dtype) def quantized_relu_common( @@ -942,8 +1112,10 @@ def quantized_relu_common( if X.dtype not in supported_dtypes: raise ValueError(f"X dtype must be one of {supported_dtypes}. Got {X.dtype}") - out_scale = -out_multiplier * (1 / (1 << 31)) * (2**out_shift) - dequantized_X = torch.where(X > X_zero_point, X - X_zero_point, torch.zeros_like(X)) + out_scale = 1.0 / (-out_multiplier * (1 / (1 << 31)) * (2**out_shift)) + dequantized_X = torch.where( + X > X_zero_point, X - X_zero_point, torch.zeros_like(X) + ).to(torch.float32) return quantize_per_tensor( dequantized_X, out_scale, @@ -955,7 +1127,6 @@ def quantized_relu_common( def quantized_relu_variant( - per_tensor: bool, dtype: torch.dtype | None = None, ) -> Callable[[Callable[..., torch.Tensor]], Callable[..., torch.Tensor]]: """Create a quantized relu variant with type checking.""" @@ -963,43 +1134,20 @@ def quantized_relu_variant( def decorator(_: Callable[..., torch.Tensor]) -> Callable[..., torch.Tensor]: def variant( X: torch.Tensor, - X_zero_point: torch.Tensor | int, + X_zero_point: int, out_zero_point: int, - out_multiplier: torch.Tensor | int, - out_shift: torch.Tensor | int, + out_multiplier: int, + 
out_shift: int, ) -> torch.Tensor: - if per_tensor: - if dtype and X.dtype != dtype: - raise ValueError(f"X dtype must be {dtype}. Got {X.dtype}") - - assert isinstance(out_shift, int) - assert isinstance(out_multiplier, int) - _out_shift = out_shift - _out_multiplier = out_multiplier - else: - assert isinstance(out_multiplier, torch.Tensor) - if out_multiplier.numel() > 1: - raise ValueError("Only scalar out_multiplier is supported") - - assert isinstance(out_shift, torch.Tensor) - if out_shift.numel() > 1: - raise ValueError("Only scalar out_shift is supported") - - assert isinstance(X_zero_point, torch.Tensor) - if X_zero_point.shape != X.shape: - raise ValueError( - f"X_zero_point shape must be {X.shape}. Got {X_zero_point.shape}" - ) - - _out_multiplier = int(out_multiplier.item()) - _out_shift = int(out_shift.item()) + if dtype and X.dtype != dtype: + raise ValueError(f"X dtype must be {dtype}. Got {X.dtype}") return quantized_relu_common( X, X_zero_point, out_zero_point, - _out_multiplier, - _out_shift, + out_multiplier, + out_shift, ) return variant @@ -1007,33 +1155,28 @@ def variant( return decorator -@impl(m, "quantized_relu") -@quantized_relu_variant(False) -def quantized_relu() -> torch.Tensor: ... - - @impl(m, "quantized_relu.per_tensor") -@quantized_relu_variant(True) +@quantized_relu_variant() def quantized_relu_per_tensor() -> torch.Tensor: ... @impl(m, "quantized_relu_asym8s_asym8s.per_tensor") -@quantized_relu_variant(True, torch.int8) +@quantized_relu_variant(torch.int8) def quantized_relu_asym8s_asym8s_per_tensor() -> torch.Tensor: ... @impl(m, "quantized_relu_asym8u_asym8u.per_tensor") -@quantized_relu_variant(True, torch.uint8) +@quantized_relu_variant(torch.uint8) def quantized_relu_asym8u_asym8u_per_tensor() -> torch.Tensor: ... 
-@impl(m, "requantize") -def requantize( +@impl(m, "requantize.per_tensor") +def requantize_per_tensor( input: torch.Tensor, - in_scale: torch.Tensor, - in_zero_point: torch.Tensor, - out_scale: torch.Tensor, - out_zero_point: torch.Tensor, + in_scale: float, + in_zero_point: int, + out_scale: float, + out_zero_point: int, dtype: ScalarType, ) -> torch.Tensor: if dtype in qdtype_map: @@ -1042,11 +1185,6 @@ def requantize( torch.dequantize(input), out_scale, out_zero_point, qdtype_map[dtype] ) - # For in_scale or out_scale other than scalar, it requires quant/dequant - # per channel, but the channel dimension value is missing - if in_scale.numel() > 1 or out_scale.numel() > 1: - raise NotImplementedError("Only scalar scales are supported") - quant_min = torch.iinfo(input.dtype).min quant_max = torch.iinfo(input.dtype).max # pyre-fixme[6]: This dtype is actually the right one. @@ -1056,15 +1194,385 @@ def requantize( return torch.ops.quantized_decomposed.quantize_per_tensor( torch.ops.quantized_decomposed.dequantize_per_tensor( input, - in_scale.flatten()[0], - in_zero_point.flatten()[0], + in_scale, + in_zero_point, quant_min, quant_max, input.dtype, ), - out_scale.flatten()[0], - out_zero_point.flatten()[0], + out_scale, + out_zero_point, out_quant_min, out_quant_max, dtype, ) + + +@impl(m, "rms_norm") +def rms_norm( + X: torch.Tensor, + normalized_shape: tuple[int], + W: torch.Tensor, + eps: float, +) -> torch.Tensor: + return W * nn.RMSNorm(list(normalized_shape), eps=eps, dtype=X.dtype)(X) + + +@impl(m, "where_Scalar") +def where_Scalar( + condition: torch.Tensor, + if_true: float, + if_false: float, +) -> torch.Tensor: + if condition.dtype != torch.bool: + raise ValueError("condition must be a bool tensor") + + return torch.where(condition, if_true, if_false) + + +@impl(m, "rope") +def rope( + input_tensor: torch.Tensor, + sin_tensor: torch.Tensor, + cos_tensor: torch.Tensor, + pos: torch.Tensor | None, +) -> torch.Tensor: + original_shape = input_tensor.shape 
+ + if len(original_shape) not in [4, 5]: + raise ValueError( + f"Input tensor must be 4D or 5D. Got {len(original_shape)}D tensor" + ) + if original_shape[0] != 1: + raise ValueError("Input tensor must have batch size 1") + if len(original_shape) == 5: + input_tensor = input_tensor.view( + input_tensor.shape[0], input_tensor.shape[1], input_tensor.shape[2], -1 + ) + + _, s, h, hd = input_tensor.shape + + if hd % 2: + raise ValueError("Hidden dimension must be divisible by 2") + + if sin_tensor.shape != (s, hd // 2) or cos_tensor.shape != (s, hd // 2): + raise ValueError( + f"sin_tensor and cos_tensor must have shape {s, hd // 2}. Got {sin_tensor.shape} and {cos_tensor.shape}" + ) + + if pos is not None: + if pos.shape != (input_tensor.shape[1],): + raise ValueError( + f"pos must have shape {input_tensor.shape[1]}. Got {pos.shape}" + ) + sin_tensor = sin_tensor[pos] + cos_tensor = cos_tensor[pos] + + sin_tensor = sin_tensor.unsqueeze(1) + cos_tensor = cos_tensor.unsqueeze(1) + + x0, x1 = input_tensor[..., ::2], input_tensor[..., 1::2] + rotated = torch.cat( + [x0 * cos_tensor - x1 * sin_tensor, x0 * sin_tensor + x1 * cos_tensor], dim=-1 + ) + return rotated.view(original_shape) + + +@impl(m, "im2row") +def im2row( + input_tensor: torch.Tensor, + kernel_size: tuple[int, int], + dilation: tuple[int, int], + padding: tuple[int, int], + stride: tuple[int, int], + in_zero_point: torch.Tensor, + channel_last: bool = False, +) -> torch.Tensor: + """ + Converts an input tensor into a 2D matrix where each row is a flattened sliding window (patch) + from the input, suitable for use in convolution as a matrix multiplication (im2row). + + Args: + - input_tensor: Input tensor of shape (N, C, H, W) or (N, H, W, C) if channel_last. + - kernel_size: Size of the convolution kernel. + - dilation: Dilation of the convolution kernel. + - padding: Padding to apply to the input. + - stride: Stride of the convolution. 
+ - in_zero_point : Zero point for input quantization (broadcastable to input). + - channel_last: If True, input is in NHWC format, else NCHW. + + Returns: + - Tensor of shape (N, num_patches, patch_size) + """ + if len(input_tensor.shape) == 3: + height_dim = 1 if channel_last else 2 + input_tensor = input_tensor.unsqueeze(height_dim) + + if in_zero_point is not None: + if in_zero_point.numel() != 1 and in_zero_point.shape != ( + input_tensor.shape[0], + ): + raise ValueError( + f"Input zero point must be a scalar or broadcastable to input shape {input_tensor.shape}" + ) + if in_zero_point.dtype != torch.int32: + raise ValueError("Input zero point must be an int32 tensor") + + if channel_last: + input_tensor = input_tensor.movedim(-1, -3).contiguous() # NHWC -> NCHW + + N, C, H, W = input_tensor.shape + kH, kW = kernel_size + dH, dW = dilation + pH, pW = padding + sH, sW = stride + + # Handle padding with zero point values + if in_zero_point is not None and (pH > 0 or pW > 0): + # Expand zero point to (N, 1, 1, 1) for broadcasting + in_zero_point = in_zero_point.expand(N) + + # Pad input with the per-batch zero point values + input_tensor = torch.stack( + [ + torch.nn.functional.pad( + input_tensor[i], + (pW, pW, pH, pH), + mode="constant", + value=in_zero_point[i].item(), + ) + for i in range(len(input_tensor)) + ] + ) + + padding = (0, 0) # Already padded manually + + # Use unfold to extract sliding local blocks + # Unfold: (N, C, H, W) -> (N, C, L, kH, kW), where L = number of sliding windows + # torch.nn.functional.unfold returns (N, C*kH*kW, L) + patches = torch.nn.functional.unfold( + input_tensor.float(), # unfold not implemented for int + kernel_size=(kH, kW), + dilation=(dH, dW), + padding=padding, + stride=(sH, sW), + ).to( + input_tensor.dtype + ) # (N, C*kH*kW, L) + + # Transpose to (N, L, C*kH*kW) + patches = patches.transpose(1, 2).contiguous() + + # Reshape to (N*L, C*kH*kW) + patches = patches.view(N, -1, C * kH * kW) + + # If channel_last, output 
should be in NHWC patch order (but im2row is always row-major) + return patches + + +@impl(m, "im2row.per_tensor") +def im2row_per_tensor( + input_tensor: torch.Tensor, + kernel_size: tuple[int, int], + dilation: tuple[int, int], + padding: tuple[int, int], + stride: tuple[int, int], + in_zero_point: int, + channel_last: bool = False, +) -> torch.Tensor: + return im2row( + input_tensor, + kernel_size, + dilation, + padding, + stride, + torch.tensor(in_zero_point, dtype=torch.int32), + channel_last, + ) + + +@impl(m, "transposed_im2row") +def transposed_im2row( + input_tensor: torch.Tensor, + kernel_size: tuple[int, int], + dilation: tuple[int, int], + padding: tuple[int, int], + stride: tuple[int, int], + output_padding: tuple[int, int], + in_zero_point: torch.Tensor, + channel_last: bool = False, +) -> torch.Tensor: + """ + Converts input tensor patches into im2row format for transposed convolutions. + This function extracts patches from input in a pattern suitable for transposed convolution. + + Args: + - input_tensor: Input spatial tensor, NCHW or NHWC format (3D or 4D). + - kernel_size: Size of the convolution kernel. + - dilation: Dilation of the convolution kernel. + - padding: Padding to apply to the input. + - stride: Stride of the convolution. + - output_padding: Additional output padding for transposed convolution. + - in_zero_point: Zero point for input quantization (broadcastable to input). + - channel_last: If True, input is in NHWC format, else NCHW. 
+ + Returns: + - 3D tensor of shape (N, output_h * output_w, kernel_h * kernel_w * in_c) + """ + # Handle 1D convolution case by adding height dimension + if len(input_tensor.shape) == 3: + height_dim = 1 if channel_last else 2 + input_tensor = input_tensor.unsqueeze(height_dim) + + if in_zero_point is not None: + if in_zero_point.dtype != torch.int32: + raise ValueError("Input zero point must be an int32 tensor") + + # Move to NCHW for processing if needed + if channel_last: + input_tensor = input_tensor.movedim(-1, -3).contiguous() # NHWC -> NCHW + + N, C, H_in, W_in = input_tensor.shape + + # Output: (N, C*H_in*W_in, H_out, W_out) + H_out = ( + (H_in - 1) * stride[0] + + kernel_size[0] + + output_padding[0] + - 2 * padding[0] + + dilation[0] * (kernel_size[0] - 1) + ) + W_out = ( + (W_in - 1) * stride[1] + + kernel_size[1] + + output_padding[1] + - 2 * padding[1] + + dilation[1] * (kernel_size[1] - 1) + ) + + # For each input pixel, create a channel where the upsampled (transposed conv) patch is placed + # Output: (N, C*H_in*W_in, H_out, W_out) + inp_flat = input_tensor.reshape(N, C * H_in * W_in) + + # Calculate output spatial size + H_out = ( + (H_in - 1) * stride[0] + - 2 * padding[0] + + dilation[0] * (kernel_size[0] - 1) + + output_padding[0] + + 1 + ) + W_out = ( + (W_in - 1) * stride[1] + - 2 * padding[1] + + dilation[1] * (kernel_size[1] - 1) + + output_padding[1] + + 1 + ) + + # Compute the upsampled (top-left) position for each input pixel + h_idx = torch.arange(H_in, device=input_tensor.device) + w_idx = torch.arange(W_in, device=input_tensor.device) + grid_h, grid_w = torch.meshgrid(h_idx, w_idx, indexing="ij") + out_h_idx = grid_h * stride[0] - padding[0] + out_w_idx = grid_w * stride[1] - padding[1] + + # Compute all input pixel positions (flattened) + ch_idx = torch.arange(C * H_in * W_in, device=input_tensor.device) + ij_idx = ch_idx % (H_in * W_in) + i_idx = ij_idx // W_in + j_idx = ij_idx % W_in + + # For each input pixel, compute the output 
positions for the kernel window + kh_idx = torch.arange(kernel_size[0], device=input_tensor.device) + kw_idx = torch.arange(kernel_size[1], device=input_tensor.device) + kh_grid, kw_grid = torch.meshgrid(kh_idx, kw_idx, indexing="ij") + kh_grid = kh_grid.reshape(-1) + kw_grid = kw_grid.reshape(-1) + num_kernel = kernel_size[0] * kernel_size[1] + + # Broadcast to all channels and kernel positions + ch_idx_b = ch_idx.repeat_interleave(num_kernel) + n_kernel = ch_idx.shape[0] * num_kernel + + i_idx_b = i_idx.repeat_interleave(num_kernel) + j_idx_b = j_idx.repeat_interleave(num_kernel) + kh_b = kh_grid.repeat(ch_idx.shape[0]) + kw_b = kw_grid.repeat(ch_idx.shape[0]) + + h_out = out_h_idx[i_idx_b, j_idx_b] + kh_b * dilation[0] + w_out = out_w_idx[i_idx_b, j_idx_b] + kw_b * dilation[1] + + # Mask for valid output positions + valid = (h_out >= 0) & (h_out < H_out) & (w_out >= 0) & (w_out < W_out) + + # Prepare indices for advanced indexing + n_idx = ( + torch.arange(N, device=input_tensor.device) + .view(-1, 1) + .expand(N, n_kernel) + .reshape(-1) + ) + ch_idx_full = ch_idx_b.expand(N, n_kernel).reshape(-1) + h_out_full = h_out.expand(N, n_kernel).reshape(-1) + w_out_full = w_out.expand(N, n_kernel).reshape(-1) + valid_full = valid.expand(N, n_kernel).reshape(-1) + + # Gather input values for each channel + inp_vals = inp_flat[:, ch_idx_b].reshape(-1) + + # Create output tensor + patches = torch.zeros((N, C * H_in * W_in, H_out, W_out), dtype=input_tensor.dtype) + + # If in_zero_point is provided, fill patches with it + if in_zero_point is not None: + if in_zero_point.numel() == 1: + patches.fill_(in_zero_point.item()) + else: + # Broadcast in_zero_point to (N, C, H_in, W_in) + assert in_zero_point.shape == (N,) + in_zero_point = in_zero_point.view(N, 1, 1, 1) + patches = patches + in_zero_point + + # Scatter input values to output positions (only valid positions) + patches[ + n_idx[valid_full], + ch_idx_full[valid_full], + h_out_full[valid_full], + 
w_out_full[valid_full], + ] = inp_vals[valid_full] + + # Optionally, flatten to (N, num_patches, patch_size) if needed + patches = patches.view(N, C * H_in * W_in, -1).transpose(1, 2).contiguous() + return patches + + +@impl(m, "quantized_embedding_byte") +def quantized_embedding_byte( + weight: torch.Tensor, + weight_scales: torch.Tensor, + weight_zero_points: torch.Tensor | None, + indices: torch.Tensor, + pruned_weights: bool = False, +) -> torch.Tensor: + if pruned_weights: + raise NotImplementedError("Pruned weights not supported") + + # Cannot use torch.ops.quantized_decomposed.embedding_byte.dtype because + # it doesn't support num_groups == 1 + num_groups = 1 + if len(weight_scales.shape) == 2: + num_groups = weight_scales.shape[1] + + group_size = weight.shape[1] // num_groups + weight = torch.ops.torchao.dequantize_affine.default( + input=weight, + block_size=(1, group_size), + scale=weight_scales, + zero_point=weight_zero_points, + input_dtype=weight.dtype, + quant_min=torch.iinfo(weight.dtype).min, + quant_max=torch.iinfo(weight.dtype).max, + ) + + return weight[indices] diff --git a/backends/cadence/aot/remove_ops.py b/backends/cadence/aot/remove_ops.py index 663c5825e52..263d3a521f3 100644 --- a/backends/cadence/aot/remove_ops.py +++ b/backends/cadence/aot/remove_ops.py @@ -9,7 +9,7 @@ import logging from dataclasses import dataclass, field -from typing import cast, List, Optional, Sequence, Set +from typing import cast, List, Optional, Sequence, Set, Type import torch import torch.fx @@ -926,19 +926,28 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: return super().call(graph_module) +class CommonRemovePasses: + passes: List[Type[ExportPass]] = [ + RemoveCloneOpPass, + RemoveAliasCopyOpPass, + RemoveNopExpandOpPass, + RemoveNopSliceOrViewOpPass, + RemoveNopSelectOpPass, + RemoveToOpsPass, + RemoveZeroSizedCatArgsPass, + RemovePermutesAroundElementwiseOps, + RemoveSqueezeViewBeforeElementwiseOps, + RemoveCatFromSliceCopyPass, + ] + 
+ class CadenceRemoveNops: - passes = [ + passes: List[Type[ExportPass]] = CommonRemovePasses.passes + [ SimplifySliceOpPass, RemoveCloneOpsTransformImported, - RemoveToOpsPass, RemoveNopRequantizeOpPass, - RemoveZeroSizedCatArgsPass, - RemoveNopSliceOrViewOpPass, - RemoveNopExpandOpPass, RemoveZeroSizedConstantPadNd, - RemoveCloneOpPass, RemoveContiguousOpPass, - RemoveAliasCopyOpPass, RemoveNopMulOpPass, RemoveNopAddOpPass, RemoveNopLinalgVectorNormOpPass, diff --git a/backends/cadence/aot/replace_ops.py b/backends/cadence/aot/replace_ops.py index c575be6e7fc..3cfc059e75b 100644 --- a/backends/cadence/aot/replace_ops.py +++ b/backends/cadence/aot/replace_ops.py @@ -43,7 +43,6 @@ from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.dialects.edge._ops import EdgeOpOverload, EdgeOpOverloadPacket from executorch.exir.pass_base import ExportPass, NodeMetadata, PassResult, ProxyValue -from torch._subclasses import FakeTensor from torch.fx.node import Argument # A map to represent ops that: @@ -90,14 +89,10 @@ def replace_logical_nop_where_with_where( # Get the third arg node and its input logical_not_node = node.args[0] - logical_not_input_tensor = ( - logical_not_node.args[0].to_tensor() - if isinstance(logical_not_node.args[0], ProxyValue) - else logical_not_node.args[0] - ) + logical_not_input_node = logical_not_node.args[0] # If the logical_not input is not a boolean tensor, bail. 
- if logical_not_input_tensor.meta["spec"].dtype != torch.bool: + if logical_not_input_node.meta["val"].dtype != torch.bool: continue # Replace the where op with another one, flipping the inputs and using the boolean @@ -263,7 +258,7 @@ def call_operator(self, op, args, kwargs, meta): return super().call_operator(op, args, kwargs, meta) # Glean the shape of input and output tensor - in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0] + in_tensor = args[0].to_tensor() in_shape = in_tensor.shape out_shape = meta["val"].shape # Get the select dimension @@ -295,7 +290,7 @@ def call_operator(self, op, args, kwargs, meta): # Create a zero bias tensor, and insert it as a graph buffer before the # current node - mat2_tensor = mat2.to_tensor() if isinstance(mat2, ProxyValue) else mat2 + mat2_tensor = mat2.to_tensor() bias_size = mat2_tensor.size(1) zero_bias = super().call_operator( exir_ops.edge.aten.full.default, @@ -410,7 +405,7 @@ def call_operator(self, op, args, kwargs, meta): return super().call_operator(op, args, kwargs, meta) # Get the old dim and new dim order - in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0] + in_tensor = args[0].to_tensor() old_dims = tuple(range(in_tensor.dim())) new_dims = args[1] @@ -438,11 +433,17 @@ class ReplaceConvolutionOptionalArgsWithConcreteArgsPass(ExportPass): """ def call_operator(self, op, args, kwargs, meta): - if get_edge_overload_packet(op) != exir_ops.edge.aten.convolution: + op_packet = get_edge_overload_packet(op) + if op_packet not in { + exir_ops.edge.cadence.convolution, + exir_ops.edge.cadence.transposed_convolution, + }: return super().call_operator(op, args, kwargs, meta) + is_transposed = op_packet == exir_ops.edge.cadence.transposed_convolution + expected_args = 9 if is_transposed else 8 + assert len(args) == expected_args # Check if the bias is already concrete - assert len(args) == 9 if args[2] is not None: return super().call_operator(op, args, kwargs, 
meta) @@ -482,11 +483,7 @@ def call_operator(self, op, args, kwargs, meta): repeats = args[1] # Glean the shapes of input tensor - in_shape = list( - in_tensor.to_tensor().shape - if isinstance(in_tensor, ProxyValue) - else in_tensor.shape - ) + in_shape = list(in_tensor.to_tensor().shape) # If the size of repeats is more than the dimensionality of the tensor, # the output of repeat will be a higher-dimensional tensor. We reshape @@ -693,43 +690,27 @@ def call_operator(self, op, args, kwargs, meta): # graph operation (in this case a transpose_copy op) to be an explicit # ProxyValue as well. If not, the view op can be done directly on the # tensor. - transposed_weight = ( - super().call_operator( - exir_ops.edge.aten.transpose_copy.int, - ( - weight, - 0, - 1, - ), - kwargs, - meta, - ) - if isinstance(weight, ProxyValue) - else weight.transpose(0, 1) + transposed_weight = super().call_operator( + exir_ops.edge.aten.transpose_copy.int, + ( + weight, + 0, + 1, + ), + kwargs, + meta, ) - flipped_weight = ( - super().call_operator( - exir_ops.edge.aten.flip.default, - ( - transposed_weight, - [-1] if transposed_weight.to_tensor().dim() == 3 else [-1, -2], - ), - kwargs, - meta, - ) - if isinstance(transposed_weight, ProxyValue) - else ( - transposed_weight.flip(-1) - if transposed_weight.dim() == 3 - else transposed_weight.flip(-1, -2) - ) + flipped_weight = super().call_operator( + exir_ops.edge.aten.flip.default, + ( + transposed_weight, + [-1] if transposed_weight.to_tensor().dim() == 3 else [-1, -2], + ), + kwargs, + meta, ) - # From the previous checks, if flipped_weight is a FakeTensor, it has to be - # a constant (if not, it would be a ProxyValue). Mark it as such. - if isinstance(flipped_weight, FakeTensor): - flipped_weight.constant = flipped_weight new_args = ( in_tensor, flipped_weight, @@ -745,16 +726,10 @@ def call_operator(self, op, args, kwargs, meta): # Verify that output_padding is 0. 
assert all( x == 0 for x in output_padding - ), "Cannot handle padded output in convolution" + ), f"Cannot handle padded output in convolution. Got {output_padding=}" - # If the innermost dim of output tensor is 1, then the stride - # should be 1. Note that the first dimension of output tensor is - # channel - new_stride = stride.copy() - out_shape = meta["val"].shape - assert out_shape is not None - for i, e in enumerate(out_shape[2:]): - new_stride[i] = 1 if e == 1 else stride[i] + # Keep the original stride to maintain correct output dimensions + new_stride = stride new_args = ( in_tensor, @@ -787,8 +762,8 @@ class ReplaceTrivialConvWithLinear(ExportPass): trivial_conv_op_to_linear_op: Dict[EdgeOpOverload, EdgeOpOverload] = { exir_ops.edge.cadence.convolution.default: exir_ops.edge.aten.linear.default, - exir_ops.edge.cadence.quantized_conv_nchw.default: exir_ops.edge.cadence.quantized_linear.default, - exir_ops.edge.cadence.quantized_conv_nhwc.default: exir_ops.edge.cadence.quantized_linear.default, + exir_ops.edge.cadence.quantized_conv2d_nchw.default: exir_ops.edge.cadence.quantized_linear.default, + exir_ops.edge.cadence.quantized_conv2d_nhwc.default: exir_ops.edge.cadence.quantized_linear.default, } def call_operator(self, op, args, kwargs, meta): @@ -800,8 +775,8 @@ def call_operator(self, op, args, kwargs, meta): # extra args holding at least the zero point and scale of input, weight, bias, # and output tensor. 
quantized_op = ( - op == exir_ops.edge.cadence.quantized_conv_nchw.default - or op == exir_ops.edge.cadence.quantized_conv_nhwc.default + op == exir_ops.edge.cadence.quantized_conv2d_nchw.default + or op == exir_ops.edge.cadence.quantized_conv2d_nhwc.default ) assert (len(args) == 8 and not quantized_op) or ( len(args) >= 12 and quantized_op @@ -809,15 +784,9 @@ def call_operator(self, op, args, kwargs, meta): (in_tensor, weight, bias, stride, padding, dilation, groups) = args[0:7] # Glean the shapes of input, weight, and output - in_shape = ( - in_tensor.to_tensor().shape - if isinstance(in_tensor, ProxyValue) - else in_tensor.shape - ) + in_shape = in_tensor.to_tensor().shape - weight_shape = ( - weight.to_tensor().shape if isinstance(weight, ProxyValue) else weight.shape - ) + weight_shape = weight.to_tensor().shape out_shape = meta["val"].shape assert None not in {in_shape, weight_shape, out_shape} @@ -839,26 +808,16 @@ def call_operator(self, op, args, kwargs, meta): # Reshape the weight to [out_channels, in_channels * X] K = math.prod(weight_shape[1:]) - # If weight is a ProxyValue, linear_weight needs to be the output of a - # graph operation (in this case a view_copy op) to be an explicit ProxyValue - # as well. If not, the view op can be done directly on the tensor. - linear_weight = ( - super().call_operator( - exir_ops.edge.aten.view_copy.default, - ( - weight, - [weight_shape[0], K], - ), - kwargs, - meta, - ) - if isinstance(weight, ProxyValue) - else weight.contiguous().view(weight_shape[0], K) + # Weight is always a ProxyValue, so we need a view_copy operation + linear_weight = super().call_operator( + exir_ops.edge.aten.view_copy.default, + ( + weight, + [weight_shape[0], K], + ), + kwargs, + meta, ) - # From the previous check, if linear_weight is a FakeTensor, it has to be - # a constant (if not, it would be a ProxyValue). Mark it as such. 
- if isinstance(linear_weight, FakeTensor): - linear_weight.constant = linear_weight # Reshape the input from 3d to 2d tensor in_view = super().call_operator( @@ -881,11 +840,7 @@ def call_operator(self, op, args, kwargs, meta): out_zero_point, ) = args[7:12] # If the multiplier and shift tensors are provided, use them. - if ( - len(args) >= 14 - and isinstance(args[12], ProxyValue) - and isinstance(args[13], ProxyValue) - ): + if len(args) >= 14: out_multiplier = args[12] out_shift = args[13] # If not, compute them. @@ -979,18 +934,18 @@ def call_operator( ) -> ProxyValue: if op not in { exir_ops.edge.cadence.convolution.default, - exir_ops.edge.cadence.quantized_conv_nchw.default, + exir_ops.edge.cadence.quantized_conv2d_nchw.default, }: return super().call_operator(op, args, kwargs, meta) - quantized_op = op == exir_ops.edge.cadence.quantized_conv_nchw.default + quantized_op = op == exir_ops.edge.cadence.quantized_conv2d_nchw.default if not quantized_op and len(args) == 8 and args[-1] is True: # Already in NHWC layout. return super().call_operator(op, args, kwargs, meta) new_op = ( - exir_ops.edge.cadence.quantized_conv_nhwc.default + exir_ops.edge.cadence.quantized_conv2d_nhwc.default if quantized_op else exir_ops.edge.cadence.convolution.default ) @@ -1067,8 +1022,8 @@ class ReplaceConvWithIm2RowAndLinear(ExportPass): # decompose to. 
conv_op_to_linear_op: Dict[EdgeOpOverload, EdgeOpOverload] = { exir_ops.edge.cadence.convolution.default: exir_ops.edge.aten.linear.default, - exir_ops.edge.cadence.quantized_conv_nchw.default: exir_ops.edge.cadence.quantized_linear.default, - exir_ops.edge.cadence.quantized_conv_nhwc.default: exir_ops.edge.cadence.quantized_linear.default, + exir_ops.edge.cadence.quantized_conv2d_nchw.default: exir_ops.edge.cadence.quantized_linear.default, + exir_ops.edge.cadence.quantized_conv2d_nhwc.default: exir_ops.edge.cadence.quantized_linear.default, } def call_operator(self, op, args, kwargs, meta): @@ -1077,8 +1032,8 @@ def call_operator(self, op, args, kwargs, meta): # Get the relevant args from convolution node. quantized_op = ( - op == exir_ops.edge.cadence.quantized_conv_nchw.default - or op == exir_ops.edge.cadence.quantized_conv_nhwc.default + op == exir_ops.edge.cadence.quantized_conv2d_nchw.default + or op == exir_ops.edge.cadence.quantized_conv2d_nhwc.default ) assert (len(args) == 8 and not quantized_op) or ( len(args) >= 12 and quantized_op @@ -1089,9 +1044,7 @@ def call_operator(self, op, args, kwargs, meta): if groups != 1: return super().call_operator(op, args, kwargs, meta) - weight_shape = ( - weight.to_tensor().shape if isinstance(weight, ProxyValue) else weight.shape - ) + weight_shape = weight.to_tensor().shape # If this is a pointwise convolution, im2col will start dominating the # runtime. So we call convolution op for this case. if ( @@ -1110,7 +1063,7 @@ def call_operator(self, op, args, kwargs, meta): # channel_last layout is specified by the channel_last arg of conv # op, which is either the last argument (15th) or implicitely False # if the op is quantized, or the last argument if not. 
- channel_last = op == exir_ops.edge.cadence.quantized_conv_nhwc.default + channel_last = op == exir_ops.edge.cadence.quantized_conv2d_nhwc.default # The weight tensor is [out_channels, in_channels, X] for NCHW layout, # and [out_channels, X, in_channels] for NHWC layout. Here, X is the # kernel_width for conv1d, and X = kernel_height * kernel_width for @@ -1130,8 +1083,6 @@ def call_operator(self, op, args, kwargs, meta): {"dtype": torch.int32}, meta, ) - if isinstance(in_tensor.to_tensor(), FakeTensor) - else get_zero_point(in_tensor.to_tensor()) ) if quantized_op else torch.tensor(0, dtype=torch.int32) @@ -1167,26 +1118,16 @@ def call_operator(self, op, args, kwargs, meta): # Get the product of the >2 dims of the weight K = math.prod(weight_shape[1:]) - # If weight is a ProxyValue, linear_weight needs to be the output of a - # graph operation (in this case a view_copy op) to be an explicit ProxyValue - # as well. If not, the view op can be done directly on the tensor. - linear_weight = ( - super().call_operator( - exir_ops.edge.aten.view_copy.default, - ( - weight, - [weight_shape[0], K], - ), - kwargs, - meta, - ) - if isinstance(weight, ProxyValue) - else weight.contiguous().view(weight_shape[0], K) + # Weight is always a ProxyValue, so we need a view_copy operation + linear_weight = super().call_operator( + exir_ops.edge.aten.view_copy.default, + ( + weight, + [weight_shape[0], K], + ), + kwargs, + meta, ) - # From the previous check, if linear_weight is a FakeTensor, it has to be - # a constant (if not, it would be a ProxyValue). Mark it as such. - if isinstance(linear_weight, FakeTensor): - linear_weight.constant = linear_weight # Create the linear node, which multiplies the 3d input with 2d weight # tensors with bias addition. The outermost dimension of the input is @@ -1200,11 +1141,7 @@ def call_operator(self, op, args, kwargs, meta): out_zero_point, ) = args[7:12] # If the multiplier and shift tensors are provided, use them. 
- if ( - len(args) >= 14 - and isinstance(args[12], ProxyValue) - and isinstance(args[13], ProxyValue) - ): + if len(args) >= 14: out_multiplier = args[12] out_shift = args[13] # If not, compute them. @@ -1292,9 +1229,7 @@ def call_operator(self, op, args, kwargs, meta): # Get the shapes out_shape = meta["val"].shape - weight_shape = ( - weight.to_tensor().shape if isinstance(weight, ProxyValue) else weight.shape - ) + weight_shape = weight.to_tensor().shape assert None not in {weight_shape, out_shape} # Determine if the transposed_convolution is NCHW or NHWC. The NHWC, @@ -1348,26 +1283,16 @@ def call_operator(self, op, args, kwargs, meta): # Reshape the weight to [out_channels, in_channels * X] K = math.prod(weight_shape[1:]) - # If weight is a ProxyValue, linear_weight needs to be the output of a - # graph operation (in this case a view_copy op) to be an explicit ProxyValue - # as well. If not, the view op can be done directly on the tensor. - linear_weight = ( - super().call_operator( - exir_ops.edge.aten.view_copy.default, - ( - weight, - [weight_shape[0], K], - ), - kwargs, - meta, - ) - if isinstance(weight, ProxyValue) - else weight.contiguous().view(weight_shape[0], K) + # Weight is always a ProxyValue, so we need a view_copy operation + linear_weight = super().call_operator( + exir_ops.edge.aten.view_copy.default, + ( + weight, + [weight_shape[0], K], + ), + kwargs, + meta, ) - # From the previous check, if linear_weight is a FakeTensor, it has to be - # a constant (if not, it would be a ProxyValue). Mark it as such. - if isinstance(linear_weight, FakeTensor): - linear_weight.constant = linear_weight # Create the linear node, which multiplies the 3d input with 2d weight # tensors with bias addition. 
The outermost dimension of the input is @@ -1438,7 +1363,7 @@ def call_operator(self, op, args, kwargs, meta): return super().call_operator(op, args, kwargs, meta) # Get the input tensor and shape - in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0] + in_tensor = args[0].to_tensor() in_shape = in_tensor.shape # Get the output tensor shape out_shape = meta["val"].shape @@ -1507,7 +1432,7 @@ def call_operator(self, op, args, kwargs, meta): return super().call_operator(op, args, kwargs, meta) # Extract the input tensor - in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0] + in_tensor = args[0].to_tensor() leading_dims = math.prod(in_tensor.shape[:-1]) # If the tensor is not a vector, do nothing. if leading_dims != 1: @@ -1573,11 +1498,7 @@ def call_operator(self, op, args, kwargs, meta): return super().call_operator( exir_ops.edge.aten.full.default, ( - ( - args[0].to_tensor().shape - if isinstance(args[0], ProxyValue) - else args[0].shape - ), + args[0].to_tensor().shape, args[1], ), {}, @@ -1618,60 +1539,58 @@ class ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass(ExportPass): replaced_scalar_args: dict[ EdgeOpOverloadPacket, tuple[EdgeOpOverload, Sequence[int]] ] = { - exir_ops.edge.cadence.quantized_add: ( + exir_ops.edge.cadence.quantized_add.default: ( exir_ops.edge.cadence.quantized_add.per_tensor, [1, 2, 4, 5], ), - exir_ops.edge.cadence.quantized_conv_nchw: ( - exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw.default: ( + exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor, [8, 9, 12, 13], ), - exir_ops.edge.cadence.quantized_conv_nhwc: ( - exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc.default: ( + exir_ops.edge.cadence.quantized_conv2d_nhwc.per_tensor, [8, 9, 12, 13], ), - exir_ops.edge.cadence.quantized_fully_connected: ( + exir_ops.edge.cadence.quantized_fully_connected.default: ( 
exir_ops.edge.cadence.quantized_fully_connected.per_tensor, [4, 5, 6], ), - exir_ops.edge.cadence.quantized_layer_norm: ( + exir_ops.edge.cadence.quantized_layer_norm.default: ( exir_ops.edge.cadence.quantized_layer_norm.per_tensor, [1, 2], ), - exir_ops.edge.cadence.quantized_linear: ( + exir_ops.edge.cadence.quantized_linear.default: ( exir_ops.edge.cadence.quantized_linear.per_tensor, [4, 5, 6], ), - exir_ops.edge.cadence.quantized_relu: ( + exir_ops.edge.cadence.quantized_relu.default: ( exir_ops.edge.cadence.quantized_relu.per_tensor, [1, 3, 4], ), - exir_ops.edge.cadence.im2row: ( + exir_ops.edge.cadence.im2row.default: ( exir_ops.edge.cadence.im2row.per_tensor, [5], ), - exir_ops.edge.cadence.requantize: ( + exir_ops.edge.cadence.requantize.default: ( exir_ops.edge.cadence.requantize.per_tensor, [1, 2, 3, 4], ), } def call_operator(self, op, args, kwargs, meta): - op_edge_overload_packet = get_edge_overload_packet(op) - - if op_edge_overload_packet not in self.replaced_scalar_args: + if op not in self.replaced_scalar_args: return super().call_operator(op, args, kwargs, meta) # Get all the args that need to be replaced. 
- new_op, args_to_be_replaced = self.replaced_scalar_args[op_edge_overload_packet] + new_op, args_to_be_replaced = self.replaced_scalar_args[op] + + if op == new_op: + return super().call_operator(op, args, kwargs, meta) updated_args = list(args) for op_arg_index in args_to_be_replaced: arg = args[op_arg_index] - if not isinstance(arg, ProxyValue): - return super().call_operator(op, args, kwargs, meta) - - if not arg.is_tensor(): + if not isinstance(arg, ProxyValue) or not arg.is_tensor(): return super().call_operator(op, args, kwargs, meta) if not isinstance(arg.node.target, EdgeOpOverload): @@ -1712,7 +1631,7 @@ def call_operator(self, op, args, kwargs, meta): # Determine if the op is avg_pool1d or avg_pool2d avg_pool1d: bool = op == exir_ops.edge.aten.avg_pool1d.default # Get the input tensor - in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0] + in_tensor = args[0].to_tensor() # Replace avg_pool2d with custom avg_pool2d, and if the input tensor is # quantized, pass its zero_point tensor as arg to the custom avg_pool2d. @@ -1725,7 +1644,7 @@ def call_operator(self, op, args, kwargs, meta): ceil_mode = args[4] if len(args) >= 5 else False count_include_pad = args[5] if len(args) >= 6 else True divisor_override = args[6] if len(args) >= 7 else None - zero_point = torch.tensor(0, dtype=torch.int32) + zero_point = args[7] if len(args) >= 8 else None # If the op is avg_pool1d, then we need to reshape the 3d input to a 4d # tensor. @@ -2078,7 +1997,7 @@ def call_operator(self, op, args, kwargs, meta): return super().call_operator(op, args, kwargs, meta) # Get the second tensor - Y_tensor = Y_arg.to_tensor() if isinstance(Y_arg, ProxyValue) else Y_arg + Y_tensor = Y_arg.to_tensor() # Concretize the bias zero_bias = super().call_operator( exir_ops.edge.aten.full.default, @@ -2087,19 +2006,14 @@ def call_operator(self, op, args, kwargs, meta): meta, ) - # If the arg was a ProxyValue, insert a transpose node. 
Otherwise we - # can simply transpose the tensor inplace. - if isinstance(Y_arg, ProxyValue): - transpose_args = (Y_arg, -1, -2) - transpose_node = super().call_operator( - exir_ops.edge.aten.transpose_copy.int, - transpose_args, - {}, - meta, - ) - Y_arg_t = transpose_node - else: - Y_arg_t = Y_tensor.transpose(-1, -2) + # Y_arg is always a ProxyValue, so we insert a transpose node + transpose_args = (Y_arg, -1, -2) + Y_arg_t = super().call_operator( + exir_ops.edge.aten.transpose_copy.int, + transpose_args, + {}, + meta, + ) # Construct the new args, and return the transposed matmult op new_args = ( @@ -2194,7 +2108,7 @@ def call_operator(self, op, args, kwargs, meta): return super().call_operator(op, args, kwargs, meta) # Get the input tensor - in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0] + in_tensor = args[0].to_tensor() # Permute NCHW to NHWC for computation in_tensor_permuted = in_tensor.permute(0, 2, 3, 1) in_tensor_shape = in_tensor_permuted.shape @@ -2242,6 +2156,52 @@ def call_operator(self, op, args, kwargs, meta): ) +@register_cadence_pass(CadencePassAttribute(opt_level=0)) +class ReplaceTorchQuantizedEmbeddingWithCadenceQuantizedEmbedding(ExportPass): + """ + Replace torch.ops.quantized_decomposed.embedding_byte.dtype with + torch.ops.cadence.quantized_embedding_byte + """ + + def call_operator( + self, + op: torch._ops.OpOverload, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + meta: NodeMetadata, + ) -> ProxyValue: + # Check if the op is the quantized_decomposed.embedding_byte.dtype + if ( + op == exir_ops.edge.quantized_decomposed.embedding_byte.default + or op == exir_ops.edge.quantized_decomposed.embedding_byte.dtype + ): + # Replace with cadence.quantized_embedding_byte + if len(args) < 6: + raise AssertionError( + f"Expected 6 arguments for embedding_byte, got {len(args)}" + ) + embedding = args[0] + scales = args[1] + weight_zero_points = args[2] + indices = args[5] + if op == 
exir_ops.edge.quantized_decomposed.embedding_byte.dtype: + dtype = kwargs.get("dtype", None) + if dtype is not None and dtype != torch.float32: + raise AssertionError( + f"Unsupported output dtype for embedding_byte: {dtype}" + ) + + new_args = (embedding, scales, weight_zero_points, indices, False) + new_kwargs = {} + return super().call_operator( + exir_ops.edge.cadence.quantized_embedding_byte.default, + new_args, + new_kwargs, + meta, + ) + return super().call_operator(op, args, kwargs, meta) + + class CommonReplacePasses: passes = [ ReplaceSqueezeAndUnsqueezeWithViewPass, @@ -2251,6 +2211,10 @@ class CommonReplacePasses: ReplaceRepeatWithCatPass, ReplaceFullLikeWithFullPass, ReplaceAtenConvolutionWithCadenceConvolutionPass, + ReplacePT2QuantWithCadenceQuantPass, + ReplacePT2DequantWithCadenceDequantPass, + ReplacePowWithMulPass, + ReplaceTorchQuantizedEmbeddingWithCadenceQuantizedEmbedding, ] @@ -2296,13 +2260,10 @@ class CadenceReplaceOpsInGraph: ReplaceScalarTensorWithFullPass, ReplaceInfArgInFullWithValuePass, ReplaceLogicalNotBooleanWhereWithWherePass, - ReplacePT2QuantWithCadenceQuantPass, - ReplacePT2DequantWithCadenceDequantPass, ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass, ReplaceAdaptiveAvgPoolWithAtenAvgPoolPass, ReplaceAtenAvgPoolWithCadenceAvgPoolPass, ReplaceWhereWithFullArgsWithWhereScalar, ReplaceAtenApproxGeluWithApproxGeluPass, - ReplacePowWithMulPass, ReplaceMulTensorWithMulAndFullOpsPass, ] diff --git a/backends/cadence/aot/simplify_ops.py b/backends/cadence/aot/simplify_ops.py index bf836f09044..92c14cb0f5d 100644 --- a/backends/cadence/aot/simplify_ops.py +++ b/backends/cadence/aot/simplify_ops.py @@ -19,7 +19,7 @@ from executorch.backends.cadence.aot.utils import rebind from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.dialects.edge._ops import EdgeOpOverload -from executorch.exir.pass_base import ExportPass, ProxyValue +from executorch.exir.pass_base import ExportPass 
@register_cadence_pass(CadencePassAttribute(opt_level=0)) @@ -75,7 +75,7 @@ def call_operator(self, op, args, kwargs, meta): slice_scatter = op == exir_ops.edge.aten.slice_scatter.default # Parse the arguments # Extract the tensor to be sliced, and the slicing dimension - in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0] + in_tensor = args[0].to_tensor() dim = args[1 + slice_scatter] if len(args) > 1 + slice_scatter else 0 # Make dim non-negative dim = dim if dim >= 0 else dim + in_tensor.dim() diff --git a/backends/cadence/aot/tests/test_ref_implementations.py b/backends/cadence/aot/tests/test_ref_implementations.py index 30b30e085dc..259752f3893 100644 --- a/backends/cadence/aot/tests/test_ref_implementations.py +++ b/backends/cadence/aot/tests/test_ref_implementations.py @@ -36,12 +36,11 @@ def test_quantize_per_tensor( ) -> None: input_tensor = torch.tensor([input_value]) scale = (f_max - f_min) / (q_max - q_min) - inv_scale = 1.0 / scale - zero_point = round(-f_min * inv_scale) + q_min + zero_point = round(-f_min * 1 / scale) + q_min expected_output = torch.tensor([expected_value], dtype=target_dtype) output = torch.ops.cadence.quantize_per_tensor( - input_tensor, inv_scale, zero_point, q_min, q_max, target_dtype + input_tensor, scale, zero_point, q_min, q_max, target_dtype ) self.assertEqual( @@ -85,7 +84,7 @@ def test_dequantize_per_tensor( expected_output = torch.tensor([expected_value], dtype=torch.float32) output = torch.ops.cadence.dequantize_per_tensor( - input_tensor, scale, zero_point, q_min, q_max, torch.float32 + input_tensor, scale, zero_point, q_min, q_max, input_tensor.dtype ) self.assertEqual( @@ -173,9 +172,9 @@ def test_quantized_add( torch.tensor( [1073741824], dtype=torch.int32 ), # out_multiplier (0.5 * 2^31) - torch.tensor([0], dtype=torch.int64), # out_shift + torch.tensor([0], dtype=torch.int32), # out_shift 0, # out_zero_point - torch.tensor([[-2]], dtype=dtype), # expected_output + torch.tensor([[0]], 
dtype=dtype), # expected_output per_tensor, False, False, @@ -198,9 +197,9 @@ def test_quantized_add( torch.tensor( [1073741824], dtype=torch.int32 ), # out_multiplier (0.5 * 2^31) - torch.tensor([0], dtype=torch.int64), # out_shift + torch.tensor([0], dtype=torch.int32), # out_shift 0, # out_zero_point - torch.tensor([[-10, -30]], dtype=dtype), # expected_output + torch.tensor([[-2, -8]], dtype=dtype), # expected_output per_tensor, False, False, @@ -208,6 +207,28 @@ def test_quantized_add( for (per_tensor, dtype) in ( (False, torch.int8), (True, torch.int8), + ) + ], + *[ + ( + torch.Size([1, 3]), # src_shape: 1 sample, 3 input features + torch.Size( + [2, 3] + ), # weight_shape: 2 output features, 3 input features + 0, # in_zero_point + torch.tensor([0, 0, 0], dtype=dtype), # weight_zero_point + torch.tensor( + [1073741824], dtype=torch.int32 + ), # out_multiplier (0.5 * 2^31) + torch.tensor([0], dtype=torch.int32), # out_shift + 0, # out_zero_point + torch.tensor([[0, 0]], dtype=dtype), # expected_output + per_tensor, + False, + False, + ) + for (per_tensor, dtype) in ( + (False, torch.uint8), (True, torch.uint8), ) ], @@ -223,10 +244,10 @@ def test_quantized_add( torch.tensor( [1073741824], dtype=torch.int32 ), # out_multiplier (0.5 * 2^31) - torch.tensor([0], dtype=torch.int64), # out_shift + torch.tensor([0], dtype=torch.int32), # out_shift 0, # out_zero_point torch.tensor( - [[[-2, -8, -14], [-6, -28, -50]]], dtype=dtype + [[[0, -2, -4], [-2, -7, -12]]], dtype=dtype ), # expected_output per_tensor, False, @@ -235,7 +256,6 @@ def test_quantized_add( for (per_tensor, dtype) in ( (False, torch.int8), (True, torch.int8), - (True, torch.uint8), ) ], # Test case 4: Non-zero zero points @@ -250,9 +270,9 @@ def test_quantized_add( torch.tensor( [268435456], dtype=torch.int32 ), # out_multiplier (1.0 * 2^31) - torch.tensor([0], dtype=torch.int64), # out_shift + torch.tensor([0], dtype=torch.int32), # out_shift 1, # out_zero_point - torch.tensor([[-15, 25]], 
dtype=dtype), # expected_output + torch.tensor([[1, 1]], dtype=dtype), # expected_output per_tensor, False, False, @@ -260,7 +280,7 @@ def test_quantized_add( for (per_tensor, dtype) in ( (False, torch.int8), (True, torch.int8), - (True, torch.uint8), + # (True, torch.uint8), ) ], # Test case 5: Non-uniform weight zero points @@ -275,14 +295,14 @@ def test_quantized_add( torch.tensor( [268435456], dtype=torch.int32 ), # out_multiplier (1.0 * 2^31) - torch.tensor([0], dtype=torch.int64), # out_shift + torch.tensor([0], dtype=torch.int32), # out_shift 1, # out_zero_point - torch.tensor([[-23, 17]], dtype=dtype), # expected_output + torch.tensor([[1, 1]], dtype=dtype), # expected_output False, False, False, ) - for dtype in (torch.int8, torch.uint8) + for dtype in (torch.int8,) ], # Test case 6: Non-zero out_shift (shift=1) *[ @@ -297,10 +317,10 @@ def test_quantized_add( [268435456], dtype=torch.int32 ), # out_multiplier (0.125 * 2^31) torch.tensor( - [1], dtype=torch.int64 + [1], dtype=torch.int32 ), # out_shift (shift=1, doubles the scale) 1, # out_zero_point - torch.tensor([[-7, 13]], dtype=dtype), # expected_output + torch.tensor([[1, 2]], dtype=dtype), # expected_output per_tensor, False, False, @@ -319,16 +339,39 @@ def test_quantized_add( [268435456], dtype=torch.int32 ), # out_multiplier (0.125 * 2^31) torch.tensor( - [1], dtype=torch.int64 + [1], dtype=torch.int32 + ), # out_shift (shift=1, doubles the scale) + 1, # out_zero_point + torch.tensor([[1, 2]], dtype=dtype), # expected_output + per_tensor, + matmul, + transposed_matmul, + ) + for (matmul, transposed_matmul) in ((True, False), (True, True)) + for (per_tensor, dtype) in ((True, torch.int8),) + ], + *[ + ( + torch.Size([2, 1, 2]), # src_shape: 1 sample, 2 input features + torch.Size( + [2, 2, 2] + ), # weight_shape: 2 output features, 2 input features + 2, # in_zero_point + torch.tensor([1, 1], dtype=dtype), # weight_zero_point + torch.tensor( + [268435456], dtype=torch.int32 + ), # out_multiplier 
(0.125 * 2^31) + torch.tensor( + [1], dtype=torch.int32 ), # out_shift (shift=1, doubles the scale) 1, # out_zero_point - torch.tensor([[-7, 17]], dtype=dtype), # expected_output + torch.tensor([[[1, 2]], [[0, -1]]], dtype=dtype), # expected_output per_tensor, matmul, transposed_matmul, ) for (matmul, transposed_matmul) in ((True, False), (True, True)) - for (per_tensor, dtype) in ((True, torch.int8), (True, torch.uint8)) + for (per_tensor, dtype) in ((True, torch.int8),) ], ] ) @@ -360,7 +403,7 @@ def test_quantized_linear( .to(expected_output.dtype) ) if matmul and not transposed_matmul: - weight = weight.T + weight = weight.transpose(-1, -2) if per_tensor: weight_zero_point = weight_zero_point[0] @@ -906,9 +949,9 @@ def test_quantized_conv_per_tensor( convs = [ ( - torch.ops.cadence.quantized_conv_nchw.per_tensor + torch.ops.cadence.quantized_conv2d_nchw.per_tensor if memory_format == torch.contiguous_format - else torch.ops.cadence.quantized_conv_nhwc.per_tensor + else torch.ops.cadence.quantized_conv2d_nhwc.per_tensor ) ] @@ -916,30 +959,30 @@ def test_quantized_conv_per_tensor( if input_tensor.dtype == torch.int8 and weight.dtype == torch.int8: if memory_format == torch.contiguous_format: optimized_convs = [ - torch.ops.cadence.quantized_conv_nchw_asym8sxsym8s_asym8s.per_tensor, - torch.ops.cadence.quantized_conv_nchw_dilated_asym8sxsym8s_asym8s.per_tensor, - torch.ops.cadence.quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor, + torch.ops.cadence.quantized_conv2d_nchw_asym8sxsym8s_asym8s.per_tensor, + torch.ops.cadence.quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s.per_tensor, + torch.ops.cadence.quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor, ] else: optimized_convs = [ - torch.ops.cadence.quantized_conv_nhwc_asym8sxsym8s_asym8s.per_tensor, - torch.ops.cadence.quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor, - torch.ops.cadence.quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor, + 
torch.ops.cadence.quantized_conv2d_nhwc_asym8sxsym8s_asym8s.per_tensor, + torch.ops.cadence.quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor, + torch.ops.cadence.quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor, ] elif input_tensor.dtype == torch.uint8 and weight.dtype == torch.uint8: if memory_format == torch.contiguous_format: optimized_convs = [ - torch.ops.cadence.quantized_conv_nchw_asym8uxsym8u_asym8u.per_tensor, - torch.ops.cadence.quantized_conv_nchw_dilated_asym8uxsym8u_asym8u.per_tensor, - torch.ops.cadence.quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor, + torch.ops.cadence.quantized_conv2d_nchw_asym8uxsym8u_asym8u.per_tensor, + torch.ops.cadence.quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u.per_tensor, + torch.ops.cadence.quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor, ] else: optimized_convs = [ - torch.ops.cadence.quantized_conv_nhwc_asym8uxsym8u_asym8u.per_tensor, - torch.ops.cadence.quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor, - torch.ops.cadence.quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor, + torch.ops.cadence.quantized_conv2d_nhwc_asym8uxsym8u_asym8u.per_tensor, + torch.ops.cadence.quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor, + torch.ops.cadence.quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor, ] convs.extend(optimized_convs) @@ -1045,21 +1088,20 @@ def test_quantized_conv_per_tensor( [4, 2, 0, -2], dtype=dtype ), # expected: relu(1,3,5,7) = (1,3,5,7) * (-1.0) + 5 = (4,2,0,-2) ) - for dtype in [torch.int8, torch.uint8] + for dtype in [torch.int8] ], - # Test case 4: Non-per-tensor *[ ( - "non_per_tensor", - torch.tensor([-1, -2, -3, 1, 2, 3], dtype=dtype), # input - torch.tensor([0, 0, 0, 1, 1, 1]), # X_zero_point + "positive_with_shift_unsigned", + torch.tensor([2, 4, 6, 8], dtype=dtype), # input + 1, # X_zero_point 5, # out_zero_point - torch.tensor([1073741824]), # out_multiplier (0.5 * 2^31) - torch.tensor([1]), # out_shift 
(multiply by 2^1 = 2) + 1073741824, # out_multiplier (0.5 * 2^31) + 1, # out_shift (multiply by 2^1 = 2) dtype, # dtype - torch.tensor([5, 5, 5, 5, 4, 3], dtype=dtype), + torch.tensor([4, 2, 0, 0], dtype=dtype), ) - for dtype in [torch.int8] + for dtype in [torch.uint8] ], ] ) @@ -1067,41 +1109,33 @@ def test_quantized_relu( self, name: str, X: torch.Tensor, - X_zero_point: torch.Tensor | int, + X_zero_point: int, out_zero_point: int, - out_multiplier: torch.Tensor | int, - out_shift: torch.Tensor | int, + out_multiplier: int, + out_shift: int, dtype: torch.dtype, expected_output: torch.Tensor, ) -> None: - if isinstance(X_zero_point, int): - assert isinstance(out_multiplier, int) - assert isinstance(out_shift, int) - - match dtype: - case torch.int8: - quantized_relu = ( - torch.ops.cadence.quantized_relu_asym8s_asym8s.per_tensor - ) - case torch.uint8: - quantized_relu = ( - torch.ops.cadence.quantized_relu_asym8u_asym8u.per_tensor - ) - case _: - quantized_relu = torch.ops.cadence.quantized_relu_per_tensor + match dtype: + case torch.int8: + quantized_relu = ( + torch.ops.cadence.quantized_relu_asym8s_asym8s.per_tensor + ) + case torch.uint8: + quantized_relu = ( + torch.ops.cadence.quantized_relu_asym8u_asym8u.per_tensor + ) + case _: + quantized_relu = torch.ops.cadence.quantized_relu_per_tensor - output = quantized_relu( - X, - X_zero_point, - out_zero_point, - out_multiplier, - out_shift, - ) - else: - output = torch.ops.cadence.quantized_relu( - X, X_zero_point, out_zero_point, out_multiplier, out_shift - ) + output = quantized_relu( + X, + X_zero_point, + out_zero_point, + out_multiplier, + out_shift, + ) # Verify output properties self.assertEqual(output.dtype, dtype, f"Output dtype should be {dtype}") @@ -1112,3 +1146,1277 @@ def test_quantized_relu( torch.equal(output, expected_output), f"Output values don't match expected in {name}. 
Got {output}, expected {expected_output}", ) + + def test_where_Scalar(self) -> None: + input_tensor = torch.tensor([1, 2, 3, 4], dtype=torch.int8) + out = torch.ops.cadence.where_Scalar(input_tensor > 2, 1.0, 0.0) + self.assertTrue( + torch.equal(out, torch.tensor([0.0, 0.0, 1.0, 1.0], dtype=torch.float32)) + ) + with self.assertRaises(ValueError) as context: + torch.ops.cadence.where_Scalar(input_tensor, 1.0, 0.0) + + self.assertIn("condition must be a bool tensor", str(context.exception)) + + @expand( + [ + ( + "h1xhd4", + torch.tensor([[[[1.0, 2.0, 3.0, 4.0]]]], dtype=torch.float32), + torch.tensor([[0.0, 0.0]], dtype=torch.float32), + torch.tensor([[1.0, 1.0]], dtype=torch.float32), + torch.tensor([[[[1.0, 3.0, 2.0, 4.0]]]], dtype=torch.float32), + ), + ( + "h2xhd4", + torch.tensor( + [[[[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0]]]], + dtype=torch.float32, + ), + torch.tensor([[0.0, 1.0]], dtype=torch.float32), + torch.tensor([[1.0, 0.0]], dtype=torch.float32), + torch.tensor( + [[[[1.0, -4.0, 2.0, 3.0], [5, -8.0, 6.0, 7.0]]]], + dtype=torch.float32, + ), + ), + ( + "s2xh2xhd4", + torch.tensor( + [ + [ + [[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0]], + [[9.0, 10.0, 11.0, 12.0], [13.0, 14.0, 15.0, 16.0]], + ] + ], + dtype=torch.float32, + ), + torch.tensor([[0.0, 1.0], [0.0, 1.0]], dtype=torch.float32), + torch.tensor([[1.0, 0.0], [1.0, 0.0]], dtype=torch.float32), + torch.tensor( + [ + [ + [[1.0, -4.0, 2.0, 3.0], [5.0, -8.0, 6.0, 7.0]], + [[9.0, -12.0, 10.0, 11.0], [13.0, -16.0, 14.0, 15.0]], + ] + ], + dtype=torch.float32, + ), + ), + ( + "pos_not_none", + torch.tensor( + [ + [ + [[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0]], + [[9.0, 10.0, 11.0, 12.0], [13.0, 14.0, 15.0, 16.0]], + ] + ], + dtype=torch.float32, + ), + torch.tensor([[1.0, 0.0], [0.0, 1.0]], dtype=torch.float32), + torch.tensor([[0.0, 1.0], [1.0, 0.0]], dtype=torch.float32), + torch.tensor( + [ + [ + [[1.0, -4.0, 2.0, 3.0], [5.0, -8.0, 6.0, 7.0]], + [[-10.0, 11.0, 9.0, 12.0], [-14.0, 15.0, 13.0, 
16.0]], + ] + ], + dtype=torch.float32, + ), + torch.tensor([1, 0]), + ), + ] + ) + def test_rope( + self, + name: str, + input_tensor: torch.Tensor, + sin_tensor: torch.Tensor, + cos_tensor: torch.Tensor, + expected_output: torch.Tensor, + pos: torch.Tensor | None = None, + ) -> None: + output = torch.ops.cadence.rope(input_tensor, sin_tensor, cos_tensor, pos) + + # Verify output properties + self.assertEqual( + output.dtype, + input_tensor.dtype, + f"Output dtype should match input dtype in {name}", + ) + self.assertEqual( + output.shape, + input_tensor.shape, + f"Output shape should match input shape in {name}", + ) + + # Verify output matches expected values + self.assertTrue( + torch.allclose(output, expected_output, rtol=1e-4, atol=1e-4), + f"Output values don't match expected in {name}. Got {output}, expected {expected_output}", + ) + + @expand( + [ + # Test case 1: Basic 2D convolution (NCHW format) + ( + "basic_2d_nchw", + torch.tensor( + [[[[1.0, 2.0], [3.0, 4.0]]]], dtype=torch.float32 + ), # input: 1x1x2x2 + torch.tensor( + [[[[1.0, 0.0], [0.0, 1.0]]]], dtype=torch.float32 + ), # weight: 1x1x2x2 (identity-like filter) + torch.tensor([0.0], dtype=torch.float32), # bias + (1, 1), # stride + (0, 0), # padding + (1, 1), # dilation + 1, # groups + False, # channel_last + torch.tensor( + [[[[5.0]]]], dtype=torch.float32 + ), # expected: 1*1 + 4*1 = 5 + ), + # Test case 2: Basic 2D convolution (NHWC format) + ( + "basic_2d_nhwc", + torch.tensor( + [[[[1.0], [2.0]], [[3.0], [4.0]]]], dtype=torch.float32 + ), # input: 1x2x2x1 (NHWC) + torch.tensor( + [[[[1.0], [0.0]], [[0.0], [1.0]]]], dtype=torch.float32 + ), # weight: 1x2x2x1 (NHWC format) + torch.tensor([0.0], dtype=torch.float32), # bias + (1, 1), # stride + (0, 0), # padding + (1, 1), # dilation + 1, # groups + True, # channel_last + torch.tensor( + [[[[5.0]]]], dtype=torch.float32 + ), # expected: 1*1 + 4*1 = 5 + ), + # Test case 3: 2D convolution with stride=2 + ( + "conv2d_stride2", + torch.tensor( + [ + 
[ + [ + [1.0, 2.0, 3.0, 4.0], + [5.0, 6.0, 7.0, 8.0], + [9.0, 10.0, 11.0, 12.0], + [13.0, 14.0, 15.0, 16.0], + ] + ] + ], + dtype=torch.float32, + ), # input: 1x1x4x4 + torch.tensor( + [[[[1.0, 1.0], [1.0, 1.0]]]], dtype=torch.float32 + ), # weight: 1x1x2x2 (sum filter) + torch.tensor([0.0], dtype=torch.float32), # bias + (2, 2), # stride=2 + (0, 0), # padding + (1, 1), # dilation + 1, # groups + False, # channel_last + torch.tensor([[[[14.0, 22.0], [46.0, 54.0]]]], dtype=torch.float32), + ), + # Test case 4: 2D convolution with padding=1 + ( + "conv2d_padding1", + torch.tensor( + [[[[1.0, 2.0], [3.0, 4.0]]]], dtype=torch.float32 + ), # input: 1x1x2x2 + torch.tensor( + [[[[1.0, 0.0], [0.0, 1.0]]]], dtype=torch.float32 + ), # weight: 1x1x2x2 + torch.tensor([0.0], dtype=torch.float32), # bias + (1, 1), # stride + (1, 1), # padding=1 + (1, 1), # dilation + 1, # groups + False, # channel_last + torch.tensor( + [[[[1.0, 2.0, 0.0], [3.0, 5.0, 2.0], [0.0, 3.0, 4.0]]]], + dtype=torch.float32, + ), # expected with padding + ), + # Test case 5: 2D convolution with dilation=2 + ( + "conv2d_dilation2", + torch.tensor( + [ + [ + [ + [1.0, 2.0, 3.0, 4.0], + [5.0, 6.0, 7.0, 8.0], + [9.0, 10.0, 11.0, 12.0], + [13.0, 14.0, 15.0, 16.0], + ] + ] + ], + dtype=torch.float32, + ), # input: 1x1x4x4 + torch.tensor( + [[[[1.0, 1.0], [1.0, 1.0]]]], dtype=torch.float32 + ), # weight: 1x1x2x2 + torch.tensor([0.0], dtype=torch.float32), # bias + (1, 1), # stride + (0, 0), # padding + (2, 2), # dilation=2 + 1, # groups + False, # channel_last + torch.tensor([[[[24.0, 28.0], [40.0, 44.0]]]], dtype=torch.float32), + ), + # Test case 6: 2D grouped convolution (groups=2) + ( + "conv2d_groups2", + torch.tensor( + [ + [ + [[1.0, 2.0], [3.0, 4.0]], # first input channel + [[5.0, 6.0], [7.0, 8.0]], # second input channel + ] + ], + dtype=torch.float32, + ), # input: 1x2x2x2 + torch.tensor( + [ + [[[1.0, 1.0], [1.0, 1.0]]], # first group weight + [[[0.5, 0.5], [0.5, 0.5]]], # second group weight + ], + 
dtype=torch.float32, + ), # weight: 2x1x2x2 + torch.tensor([0.0, 1.0], dtype=torch.float32), # bias + (1, 1), # stride + (0, 0), # padding + (1, 1), # dilation + 2, # groups=2 + False, # channel_last + torch.tensor([[[[10.0]], [[14.0]]]], dtype=torch.float32), + ), + # Test case 7: 1D convolution (NCL format) + ( + "conv1d_ncl", + torch.tensor( + [[[1.0, 2.0, 3.0, 4.0]]], dtype=torch.float32 + ), # input: 1x1x4 + torch.tensor([[[1.0, 1.0]]], dtype=torch.float32), # weight: 1x1x2 + torch.tensor([0.0], dtype=torch.float32), # bias + (1, 1), # stride (only stride[1] is used for 1D) + (0, 0), # padding (only padding[1] is used for 1D) + (1, 1), # dilation (only dilation[1] is used for 1D) + 1, # groups + False, # channel_last + torch.tensor( + [[[3.0, 5.0, 7.0]]], dtype=torch.float32 + ), # expected: [1+2, 2+3, 3+4] + ), + # Test case 8: 1D convolution (NLC format) + ( + "conv1d_nlc", + torch.tensor( + [[[1.0], [2.0], [3.0], [4.0]]], dtype=torch.float32 + ), # input: 1x4x1 (NLC) + torch.tensor( + [[[1.0], [1.0]]], dtype=torch.float32 + ), # weight: 1x2x1 (NLC) + torch.tensor([0.0], dtype=torch.float32), # bias + (1, 1), # stride + (0, 0), # padding + (1, 1), # dilation + 1, # groups + True, # channel_last + torch.tensor([[[3.0], [5.0], [7.0]]], dtype=torch.float32), + ), + # Test case 9: Multi-channel input and output + ( + "multi_channel", + torch.tensor( + [ + [ + [[1.0, 2.0], [3.0, 4.0]], # first input channel + [[0.5, 1.0], [1.5, 2.0]], # second input channel + ] + ], + dtype=torch.float32, + ), # input: 1x2x2x2 + torch.tensor( + [ + [ # first output channel + [[1.0, 0.0], [0.0, 1.0]], # weights for first input channel + [ + [2.0, 0.0], + [0.0, 2.0], + ], # weights for second input channel + ], + [ # second output channel + [[0.5, 0.5], [0.5, 0.5]], # weights for first input channel + [ + [1.0, 1.0], + [1.0, 1.0], + ], # weights for second input channel + ], + ], + dtype=torch.float32, + ), # weight: 2x2x2x2 + torch.tensor([0.0, 1.0], dtype=torch.float32), # bias + 
(1, 1), # stride + (0, 0), # padding + (1, 1), # dilation + 1, # groups + False, # channel_last + torch.tensor([[[[10.0]], [[11.0]]]], dtype=torch.float32), + ), + # Test case 10: Convolution with non-zero bias + ( + "conv2d_with_bias", + torch.tensor( + [[[[1.0, 2.0], [3.0, 4.0]]]], dtype=torch.float32 + ), # input: 1x1x2x2 + torch.tensor( + [[[[1.0, 0.0], [0.0, 1.0]]]], dtype=torch.float32 + ), # weight: 1x1x2x2 + torch.tensor([10.0], dtype=torch.float32), # bias=10 + (1, 1), # stride + (0, 0), # padding + (1, 1), # dilation + 1, # groups + False, # channel_last + torch.tensor( + [[[[15.0]]]], dtype=torch.float32 + ), # expected: 5 + 10 = 15 + ), + ] + ) + def test_convolution( + self, + name: str, + input_tensor: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: tuple[int, int], + padding: tuple[int, int], + dilation: tuple[int, int], + groups: int, + channel_last: bool, + expected_output: torch.Tensor, + ) -> None: + output = torch.ops.cadence.convolution( + input_tensor, + weight, + bias, + stride, + padding, + dilation, + groups, + channel_last, + ) + + # Verify output properties + self.assertEqual( + output.dtype, + input_tensor.dtype, + f"Output dtype should match input dtype in {name}", + ) + self.assertEqual( + output.shape, + expected_output.shape, + f"Output shape should match expected shape in {name}", + ) + + # Verify output matches expected values + self.assertTrue( + torch.equal(output, expected_output), + f"Output values don't match expected in {name}. 
Got {output}, expected {expected_output}", + ) + + @expand( + [ + # Basic 2D transposed convolution with stride=1 (current test case - corrected name) + ( + "basic_2d_stride1", + torch.tensor( + [[[[1.0, 2.0], [3.0, 4.0]]]], dtype=torch.float32 + ), # input: 1x1x2x2 + torch.tensor( + [[[[1.0, 1.0], [1.0, 1.0]]]], dtype=torch.float32 + ), # weight: 1x1x2x2 + torch.tensor([0.0], dtype=torch.float32), # bias + (1, 1), # stride + (0, 0), # padding + (1, 1), # dilation + 1, # groups + (0, 0), # output_padding + False, # channel_last + torch.tensor( + [[[[1.0, 3.0, 2.0], [4.0, 10.0, 6.0], [3.0, 7.0, 4.0]]]], + dtype=torch.float32, + ), + ), + # 2D transposed convolution with channel_last=True (NHWC format) + ( + "channel_last_nhwc", + torch.tensor( + [[[[1.0], [2.0]], [[3.0], [4.0]]]], dtype=torch.float32 + ), # input: 1x2x2x1 (NHWC) + torch.tensor( + [[[[1.0], [1.0]], [[1.0], [1.0]]]], dtype=torch.float32 + ), # weight: 1x2x2x1 (NHWC) + torch.tensor([0.0], dtype=torch.float32), # bias + (1, 1), # stride + (0, 0), # padding + (1, 1), # dilation + 1, # groups + (0, 0), # output_padding + True, # channel_last=True + torch.tensor( + [ + [ + [[1.0], [3.0], [2.0]], + [[4.0], [10.0], [6.0]], + [[3.0], [7.0], [4.0]], + ] + ], + dtype=torch.float32, + ), + ), + # 2D transposed convolution with non-zero bias + ( + "with_bias", + torch.tensor( + [[[[1.0, 2.0], [3.0, 4.0]]]], dtype=torch.float32 + ), # input: 1x1x2x2 + torch.tensor( + [[[[1.0, 0.0], [0.0, 1.0]]]], dtype=torch.float32 + ), # weight: 1x1x2x2 + torch.tensor([5.0], dtype=torch.float32), # bias=5.0 + (1, 1), # stride + (0, 0), # padding + (1, 1), # dilation + 1, # groups + (0, 0), # output_padding + False, # channel_last + torch.tensor( + [[[[6.0, 7.0, 5.0], [8.0, 10.0, 7.0], [5.0, 8.0, 9.0]]]], + dtype=torch.float32, + ), + ), + # 1D transposed convolution (3D tensor, NLC format) + ( + "conv1d_nlc", + torch.tensor( + [[[1.0], [2.0], [3.0]]], dtype=torch.float32 + ), # input: 1x3x1 (NLC) + torch.tensor( + [[[1.0], 
[0.5]]], dtype=torch.float32 + ), # weight: 1x2x1 (NLC) + torch.tensor([0.0], dtype=torch.float32), # bias + (2, 0), # stride + (0, 0), # padding + (1, 1), # dilation + 1, # groups + (0, 0), # output_padding + True, # channel_last=True + torch.tensor( + [[[1.0], [0.5], [2.0], [1.0], [3.0], [1.5]]], dtype=torch.float32 + ), + ), + ] + ) + def test_transposed_convolution( + self, + name: str, + input_tensor: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: tuple[int, int], + padding: tuple[int, int], + dilation: tuple[int, int], + groups: int, + output_padding: tuple[int, int], + channel_last: bool, + expected_output: torch.Tensor, + ) -> None: + output = torch.ops.cadence.transposed_convolution( + input_tensor, + weight, + bias, + stride, + padding, + dilation, + output_padding, + groups, + channel_last, + ) + + # Verify output properties + self.assertEqual( + output.dtype, + input_tensor.dtype, + f"Output dtype should match input dtype in {name}", + ) + self.assertEqual( + output.shape, + expected_output.shape, + f"Output shape should match expected shape in {name}", + ) + + # Verify output matches expected values + self.assertTrue( + torch.equal(output, expected_output), + f"Output values don't match expected in {name}. 
Got {output}, expected {expected_output}", + ) + + @expand( + [ + # Basic non-quantized average pooling + ( + "basic_non_quantized", + torch.tensor( + [ + [ + [ + [1.0, 2.0, 3.0, 4.0], + [5.0, 6.0, 7.0, 8.0], + [9.0, 10.0, 11.0, 12.0], + [13.0, 14.0, 15.0, 16.0], + ] + ] + ], + dtype=torch.float32, + ), # input: 1x1x4x4 + (2, 2), # kernel_size + (2, 2), # stride + (0, 0), # padding + False, # ceil_mode + False, # count_include_pad + None, # divisor_override + None, # in_zero_point (non-quantized) + False, # channel_last + torch.tensor( + [[[[3.5, 5.5], [11.5, 13.5]]]], dtype=torch.float32 + ), # expected: average of 2x2 blocks + ), + # Non-quantized with count_include_pad=True and padding + ( + "non_quantized_count_include_pad", + torch.tensor( + [[[[1.0, 2.0], [3.0, 4.0]]]], dtype=torch.float32 + ), # input: 1x1x2x2 + (3, 3), # kernel_size (larger than input) + (1, 1), # stride + (1, 1), # padding + False, # ceil_mode + True, # count_include_pad=True + None, # divisor_override + None, # in_zero_point (non-quantized) + False, # channel_last + torch.tensor( + [[[[2.5, 2.5], [2.5, 2.5]]]], + dtype=torch.float32, + ), + ), + # Non-quantized with divisor_override + ( + "non_quantized_divisor_override", + torch.tensor( + [[[[2.0, 4.0], [6.0, 8.0]]]], dtype=torch.float32 + ), # input: 1x1x2x2 + (2, 2), # kernel_size + (1, 1), # stride + (0, 0), # padding + False, # ceil_mode + False, # count_include_pad + 2, # divisor_override (instead of 4) + None, # in_zero_point (non-quantized) + False, # channel_last + torch.tensor( + [[[[10.0]]]], dtype=torch.float32 + ), # expected: (2+4+6+8)/2 = 10 + ), + # Quantized with non-zero zero_point and padding + ( + "quantized_nonzero_zero_point", + torch.tensor( + [[[[130, 132], [134, 136]]]], dtype=torch.uint8 + ), # input: 1x1x2x2, values around zero_point=128 + (3, 3), # kernel_size + (1, 1), # stride + (1, 1), # padding + False, # ceil_mode + True, # count_include_pad=True + None, # divisor_override + 128, # in_zero_point=128 
(padded areas will have this value) + False, # channel_last + torch.tensor( + [[[[130, 130], [130, 130]]]], dtype=torch.uint8 + ), # expected: averages including padded zero_point values + ), + # Quantized with divisor_override + ( + "quantized_divisor_override", + torch.tensor( + [[[[64, 96], [128, 160]]]], dtype=torch.float32 + ), # input: 1x1x2x2 + (2, 2), # kernel_size + (1, 1), # stride + (0, 0), # padding + False, # ceil_mode + False, # count_include_pad + 2, # divisor_override (instead of 4) + None, # in_zero_point=None + False, # channel_last + torch.tensor( + [[[[224]]]], dtype=torch.float32 + ), # expected: (64+96+128+160)/2 = 224 + ), + # Large values that need clamping + ( + "quantized_clamping_test", + torch.tensor( + [[[[120, 125], [125, 127]]]], dtype=torch.int8 + ), # input: 1x1x2x2, large values for int8 + (2, 2), # kernel_size + (1, 1), # stride + (0, 0), # padding + False, # ceil_mode + False, # count_include_pad + None, # divisor_override + 0, # in_zero_point=0 + False, # channel_last + torch.tensor( + [[[[124]]]], dtype=torch.int8 + ), # expected: (120+125+125+127)/4 = 124.25 -> 124, within int8 range + ), + ] + ) + def test_avg_pool2d( + self, + name: str, + input_tensor: torch.Tensor, + kernel_size: tuple[int, int], + stride: tuple[int, int], + padding: tuple[int, int], + ceil_mode: bool, + count_include_pad: bool, + divisor_override: int | None, + in_zero_point: int | None, + channel_last: bool, + expected_output: torch.Tensor, + ) -> None: + output = torch.ops.cadence.avg_pool2d( + input_tensor, + kernel_size, + stride, + padding, + ceil_mode, + count_include_pad, + divisor_override, + in_zero_point if in_zero_point is None else torch.tensor([in_zero_point]), + channel_last, + ) + + # Verify output properties + self.assertEqual( + output.dtype, + input_tensor.dtype, + f"Output dtype should match input dtype in {name}", + ) + self.assertEqual( + output.shape, + expected_output.shape, + f"Output shape should match expected shape in {name}", + 
) + + # Verify output matches expected values + if input_tensor.dtype.is_floating_point: + self.assertTrue( + torch.allclose(output, expected_output, rtol=1e-4, atol=1e-4), + f"Output values don't match expected in {name}. Got {output}, expected {expected_output}", + ) + else: + self.assertTrue( + torch.equal(output, expected_output), + f"Output values don't match expected in {name}. Got {output}, expected {expected_output}", + ) + + @expand( + [ + # Basic 2x2 kernel, stride 1, no padding, NCHW + ( + "nchw_basic_2x2", + torch.tensor( + [[[[1, 2, 3], [4, 5, 6], [7, 8, 9]]]], dtype=torch.float32 + ), # (N=1, C=1, H=3, W=3) + (2, 2), # kernel_size + (1, 1), # dilation + (0, 0), # padding + (1, 1), # stride + None, # in_zero_point + False, # channel_last + False, + torch.tensor( + [ + [[1, 2, 4, 5], [2, 3, 5, 6], [4, 5, 7, 8], [5, 6, 8, 9]], + ], + dtype=torch.float32, + ), + ), + # 2x2 kernel, stride 2, no padding, NCHW + ( + "nchw_stride2", + torch.tensor( + [[[[1, 2, 3], [4, 5, 6], [7, 8, 9]]]], dtype=torch.float32 + ), + (2, 2), + (1, 1), + (0, 0), + (2, 2), + None, + False, + False, + torch.tensor( + [ + [[1, 2, 4, 5]], + ], + dtype=torch.float32, # Only every other patch in each dim + ), + ), + # 2x2 kernel, stride 1, padding 1, NCHW + ( + "nchw_padding1", + torch.tensor([[[[1, 2], [3, 4]]]], dtype=torch.float32), # (1,1,2,2) + (2, 2), + (1, 1), + (1, 1), + (1, 1), + None, + False, + False, + torch.tensor( + [ + [ + [0, 0, 0, 1], + [0, 0, 1, 2], + [0, 0, 2, 0], + [0, 1, 0, 3], + [1, 2, 3, 4], + [2, 0, 4, 0], + [0, 3, 0, 0], + [3, 4, 0, 0], + [4, 0, 0, 0], + ], + ], + dtype=torch.float32, + ), + ), + # 2x2 kernel, stride 1, no padding, NHWC + ( + "nhwc_basic_2x2", + torch.tensor( + [[[[1], [2], [3]], [[4], [5], [6]], [[7], [8], [9]]]], + dtype=torch.float32, + ), # (N=1, H=3, W=3, C=1) + (2, 2), + (1, 1), + (0, 0), + (1, 1), + None, + True, + False, + torch.tensor( + [ + [[1, 2, 4, 5], [2, 3, 5, 6], [4, 5, 7, 8], [5, 6, 8, 9]], + ], + dtype=torch.float32, + ), + 
), + # 2x2 kernel, stride 1, no padding, NCHW, in_zero_point=1 + ( + "nchw_in_zero_point_no_padding", + torch.tensor([[[[2, 3, 4], [5, 6, 7], [8, 9, 10]]]], dtype=torch.int8), + (2, 2), + (1, 1), + (0, 0), + (1, 1), + torch.tensor(1, dtype=torch.int32), + False, + False, + torch.tensor( + [ + [[2, 3, 5, 6], [3, 4, 6, 7], [5, 6, 8, 9], [6, 7, 9, 10]], + ], + dtype=torch.int8, + ), + ), + ( + "nchw_in_zero_point_with_padding=1_and_stride=2", + torch.tensor([[[[2, 3, 4], [5, 6, 7], [8, 9, 10]]]], dtype=torch.int8), + (2, 2), + (1, 1), + (1, 1), + (2, 2), + torch.tensor(-1, dtype=torch.int32), + False, + False, + torch.tensor( + [ + [ + [-1, -1, -1, 2], + [-1, -1, 3, 4], + [-1, 5, -1, 8], + [6, 7, 9, 10], + ], + ], + dtype=torch.int8, + ), + ), + # 2x2 kernel, stride 1, no padding, NHWC, in_zero_point=2 + ( + "nhwc_in_zero_point", + torch.tensor( + [[[[3], [4], [5]], [[6], [7], [8]], [[9], [10], [11]]]], + dtype=torch.int8, + ), + (2, 2), + (1, 1), + (0, 0), + (1, 1), + torch.tensor(2, dtype=torch.int32), + True, + False, + torch.tensor( + [ + [[3, 4, 6, 7], [4, 5, 7, 8], [6, 7, 9, 10], [7, 8, 10, 11]], + ], + dtype=torch.int8, + ), + ), + # Multi-channel input, 2x2 kernel, stride 1, no padding, NCHW + ( + "nchw_multi_channel", + torch.tensor( + [ + [ + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], # channel 0 + [[10, 11, 12], [13, 14, 15], [16, 17, 18]], # channel 1 + ] + ], + dtype=torch.float32, + ), # (1,2,3,3) + (2, 2), + (1, 1), + (0, 0), + (1, 1), + None, + False, + False, + torch.tensor( + [ + [ + [1, 2, 4, 5, 10, 11, 13, 14], + [2, 3, 5, 6, 11, 12, 14, 15], + [4, 5, 7, 8, 13, 14, 16, 17], + [5, 6, 8, 9, 14, 15, 17, 18], + ], + ], + dtype=torch.float32, + ), + ), + # Multi-channel input and multi-channel zero-point + ( + "nchw_multi_channel_and_zero_point_no_padding", + torch.tensor([[[1, 2, 3]], [[4, 5, 6]]], dtype=torch.int32), + (1, 2), + (1, 1), + (0, 0), + (1, 1), + torch.tensor([-1, -2], dtype=torch.int32), + False, + False, + torch.tensor([[[1, 2], [2, 3]], [[4, 
5], [5, 6]]], dtype=torch.int32), + ), + ( + "nchw_multi_channel_and_zero_point_with_padding=1_and_stride=(2, 1)", + torch.tensor([[[1, 2, 3]], [[4, 5, 6]]], dtype=torch.int32), + (1, 2), + (1, 1), + (2, 1), + (2, 2), + torch.tensor([-1, -2], dtype=torch.int32), + False, + False, + torch.tensor( + [ + [ + [-1, -1], + [-1, -1], + [-1, 1], + [2, 3], + [-1, -1], + [-1, -1], + ], + [ + [-2, -2], + [-2, -2], + [-2, 4], + [5, 6], + [-2, -2], + [-2, -2], + ], + ], + dtype=torch.int32, + ), + ), + ( + "per_tensor", + torch.tensor( + [[[[3], [4], [5]], [[6], [7], [8]], [[9], [10], [11]]]], + dtype=torch.int8, + ), + (2, 2), + (1, 1), + (0, 0), + (1, 1), + 2, + True, + True, + torch.tensor( + [ + [[3, 4, 6, 7], [4, 5, 7, 8], [6, 7, 9, 10], [7, 8, 10, 11]], + ], + dtype=torch.int8, + ), + ), + ] + ) + def test_im2row( + self, + name: str, + input_tensor: torch.Tensor, + kernel_size: tuple[int, int], + dilation: tuple[int, int], + padding: tuple[int, int], + stride: tuple[int, int], + in_zero_point: torch.Tensor | None, + channel_last: bool, + per_tensor: bool, + expected_output: torch.Tensor, + ) -> None: + if per_tensor: + output = torch.ops.cadence.im2row.per_tensor( + input_tensor, + kernel_size, + dilation, + padding, + stride, + in_zero_point, + channel_last, + ) + else: + output = torch.ops.cadence.im2row( + input_tensor, + kernel_size, + dilation, + padding, + stride, + in_zero_point, + channel_last, + ) + self.assertEqual( + output.shape, + expected_output.shape, + f"im2row output shape mismatch in {name}", + ) + self.assertTrue( + torch.equal(output, expected_output), + f"im2row output mismatch in {name}: got {output}, expected {expected_output}", + ) + + @expand( + [ + ( + "basic_2x2", + torch.tensor([[[[1, 2], [3, 4]]]], dtype=torch.int32), + (2, 2), + (1, 1), + (0, 0), + (1, 1), + (0, 0), + None, + False, + torch.tensor( + [ + [ + [1, 0, 0, 0], + [1, 2, 0, 0], + [0, 2, 0, 0], + [1, 0, 3, 0], + [1, 2, 3, 4], + [0, 2, 0, 4], + [0, 0, 3, 0], + [0, 0, 3, 4], + [0, 0, 
0, 4], + ] + ], + dtype=torch.int32, + ), + ), + ( + "basic_2x2_with_zero_point", + torch.tensor([[[[1, 2], [3, 4]]]], dtype=torch.int32), + (2, 2), + (1, 1), + (0, 0), + (1, 1), + (0, 0), + torch.tensor(100, dtype=torch.int32), + False, + torch.tensor( + [ + [ + [1, 100, 100, 100], + [1, 2, 100, 100], + [100, 2, 100, 100], + [1, 100, 3, 100], + [1, 2, 3, 4], + [100, 2, 100, 4], + [100, 100, 3, 100], + [100, 100, 3, 4], + [100, 100, 100, 4], + ] + ], + dtype=torch.int32, + ), + ), + ( + "basic_2x2_with_stride_2", + torch.tensor([[[[1, 2], [3, 4]]]], dtype=torch.int32), + (2, 2), # kernel size + (1, 1), # dilation + (0, 0), # padding + (2, 2), # stride + (0, 0), # output padding + None, + False, + torch.tensor( + [ + [ + [1, 0, 0, 0], + [1, 0, 0, 0], + [0, 2, 0, 0], + [0, 2, 0, 0], + [1, 0, 0, 0], + [1, 0, 0, 0], + [0, 2, 0, 0], + [0, 2, 0, 0], + [0, 0, 3, 0], + [0, 0, 3, 0], + [0, 0, 0, 4], + [0, 0, 0, 4], + [0, 0, 3, 0], + [0, 0, 3, 0], + [0, 0, 0, 4], + [0, 0, 0, 4], + ] + ], + dtype=torch.int32, + ), + ), + ( + "batch2_with_batch2_zero_point", + torch.tensor( + [ + [[[1, 2], [3, 4]]], + [[[5, 6], [7, 8]]], + ], + dtype=torch.int32, + ), # input: (2,1,2,2) + (2, 2), # kernel_size + (1, 1), # dilation + (0, 0), # padding + (1, 1), # stride + (0, 0), # output_padding + torch.tensor([100, 200], dtype=torch.int32), # in_zero_point per batch + False, # channel_last + torch.tensor( + [ + [ + [1, 100, 100, 100], + [1, 2, 100, 100], + [100, 2, 100, 100], + [1, 100, 3, 100], + [1, 2, 3, 4], + [100, 2, 100, 4], + [100, 100, 3, 100], + [100, 100, 3, 4], + [100, 100, 100, 4], + ], + [ + [5, 200, 200, 200], + [5, 6, 200, 200], + [200, 6, 200, 200], + [5, 200, 7, 200], + [5, 6, 7, 8], + [200, 6, 200, 8], + [200, 200, 7, 200], + [200, 200, 7, 8], + [200, 200, 200, 8], + ], + ], + dtype=torch.int32, + ), + ), + ] + ) + def test_transposed_im2row( + self, + name: str, + input_tensor: torch.Tensor, + kernel_size: tuple[int, int], + dilation: tuple[int, int], + padding: tuple[int, 
int], + stride: tuple[int, int], + output_padding: tuple[int, int], + in_zero_point: torch.Tensor | int | None, + channel_last: bool, + expected_output: torch.Tensor, + ) -> None: + output = torch.ops.cadence.transposed_im2row( + input_tensor, + kernel_size, + dilation, + padding, + stride, + output_padding, + in_zero_point, + channel_last, + ) + + self.assertEqual( + output.shape, + expected_output.shape, + f"transposed_im2row output shape mismatch in {name}: got {output.shape}, expected {expected_output.shape}", + ) + self.assertTrue( + torch.equal(output, expected_output), + f"transposed_im2row output mismatch in {name}: got {output}, expected {expected_output}", + ) + + @expand( + [ + ( + "1_group", + torch.tensor([[0, 1, 2], [3, 4, 5], [6, 7, 8]], dtype=torch.int8), + torch.tensor([1, 1, 1], dtype=torch.float32), + torch.tensor([0, 0, 0], dtype=torch.int8), + torch.tensor([0, 2, 1], dtype=torch.int64), + torch.tensor( + [[0.0, 1.0, 2.0], [6.0, 7.0, 8.0], [3.0, 4.0, 5.0]], + dtype=torch.float32, + ), + ), + ( + "2_groups", + torch.tensor( + [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]], dtype=torch.int8 + ), + torch.tensor([[0.5, 1.0], [1.5, 2.0], [2.5, 3.0]], dtype=torch.float32), + torch.tensor([[0, 1], [2, 3], [4, 5]], dtype=torch.int8), + torch.tensor([0, 2, 1], dtype=torch.int64), + torch.tensor( + [ + [0.0, 0.5, 1.0, 2.0], + [10.0, 12.5, 15.0, 18.0], + [3.0, 4.5, 6.0, 8.0], + ], + dtype=torch.float32, + ), + ), + ( + "1_group_none_zero_point", + torch.tensor([[0, 1, 2], [3, 4, 5], [6, 7, 8]], dtype=torch.int8), + torch.tensor([1, 1, 1], dtype=torch.float32), + None, + torch.tensor([0, 2, 1], dtype=torch.int64), + torch.tensor( + [[0.0, 1.0, 2.0], [6.0, 7.0, 8.0], [3.0, 4.0, 5.0]], + dtype=torch.float32, + ), + ), + ( + "1_group_batch2", + torch.tensor([[0, 1, 2], [3, 4, 5], [6, 7, 8]], dtype=torch.int8), + torch.tensor([1, 1, 1], dtype=torch.float32), + torch.tensor([0, 0, 0], dtype=torch.int8), + torch.tensor([[0, 2, 1], [1, 0, 2]], dtype=torch.int64), + 
torch.tensor( + [ + [[0.0, 1.0, 2.0], [6.0, 7.0, 8.0], [3.0, 4.0, 5.0]], + [[3.0, 4.0, 5.0], [0.0, 1.0, 2.0], [6.0, 7.0, 8.0]], + ], + dtype=torch.float32, + ), + ), + ( + "2_groups_batch2", + torch.tensor( + [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]], dtype=torch.int8 + ), + torch.tensor([[0.5, 1.0], [1.5, 2.0], [2.5, 3.0]], dtype=torch.float32), + torch.tensor([[0, 1], [2, 3], [4, 5]], dtype=torch.int8), + torch.tensor([[0, 2, 1], [2, 1, 0]], dtype=torch.int64), + torch.tensor( + [ + [ + [0.0, 0.5, 1.0, 2.0], + [10.0, 12.5, 15.0, 18.0], + [3.0, 4.5, 6.0, 8.0], + ], + [ + [10.0, 12.5, 15.0, 18.0], + [3.0, 4.5, 6.0, 8.0], + [0.0, 0.5, 1.0, 2.0], + ], + ], + dtype=torch.float32, + ), + ), + ( + "1_group_none_zero_point_batch2", + torch.tensor([[0, 1, 2], [3, 4, 5], [6, 7, 8]], dtype=torch.int8), + torch.tensor([1, 1, 1], dtype=torch.float32), + None, + torch.tensor([[0, 2, 1], [1, 0, 2]], dtype=torch.int64), + torch.tensor( + [ + [[0.0, 1.0, 2.0], [6.0, 7.0, 8.0], [3.0, 4.0, 5.0]], + [[3.0, 4.0, 5.0], [0.0, 1.0, 2.0], [6.0, 7.0, 8.0]], + ], + dtype=torch.float32, + ), + ), + ] + ) + def test_quantized_embedding_byte( + self, + name: str, + weight: torch.Tensor, + weight_scales: torch.Tensor, + weight_zero_points: torch.Tensor | None, + indices: torch.Tensor, + expected_out: torch.Tensor, + ) -> None: + self.assertTrue( + torch.equal( + torch.ops.cadence.quantized_embedding_byte( + weight, weight_scales, weight_zero_points, indices + ), + expected_out, + ) + ) diff --git a/backends/cadence/aot/tests/test_replace_ops_passes.py b/backends/cadence/aot/tests/test_replace_ops_passes.py index ca5168db2be..e2fbd516757 100644 --- a/backends/cadence/aot/tests/test_replace_ops_passes.py +++ b/backends/cadence/aot/tests/test_replace_ops_passes.py @@ -45,6 +45,7 @@ ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass, ReplaceSplitWithSlicePass, ReplaceSqueezeAndUnsqueezeWithViewPass, + ReplaceTorchQuantizedEmbeddingWithCadenceQuantizedEmbedding, 
ReplaceTransposedConvWithLinearPass, ReplaceTrivialConvWithLinear, ReplaceWhereWithFullArgsWithWhereScalar, @@ -52,9 +53,10 @@ from executorch.backends.cadence.aot.typing_stubs import expand from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import ExportPass +from executorch.exir.pass_base import ExportPass, ProxyValue from executorch.exir.passes import dead_code_elimination_pass from torch.fx.passes.infra.pass_base import PassResult +from torch.utils import _pytree as pytree class TestReplaceOpsPasses(unittest.TestCase): @@ -345,6 +347,194 @@ def test_replace_functionally_equivalent_op_targets_unsafe_split( count_node(graph_after_passes, exir_ops.edge.aten.unsafe_split.Tensor), 0, x ) + def assertTensorMetadataIsSame( + self, a: Sequence[torch.Tensor], b: Sequence[torch.Tensor] + ) -> None: + for i, (_a, _b) in enumerate(zip(a, b)): + # TODO: actually compare the tensors. + self.assertTrue( + _a.shape == _b.shape, f"Tensor {i}: {_a.shape} != {_b.shape}" + ) + self.assertTrue( + _a.dtype == _b.dtype, f"Tensor {i}: {_a.dtype} != {_b.dtype}" + ) + + @expand( + [ + [(1, 8, 18), 8, 16, 3], + [(1, 8, 18), 8, 16, 5, 2], + # depthwise + bias + [(1, 8, 18), 8, 16, 5, 2, 0, 1, True], + # no bias + [(1, 8, 18), 8, 16, 3, 2, 4, 3, False, False], + # bias + transposed + [(1, 8, 18), 8, 16, 5, 2, 0, 1, False, True], + # Stride of 2 needed. 
+ [(1, 8, 3), 8, 8, 48, 2, 23], + ] + ) + @torch.no_grad() + def test_replace_aten_conv_with_cadence_conv( + self, + shape: Tuple[int, ...], + in_channels: int, + out_channels: int, + kernel: int, + stride: int = 1, + padding: int = 0, + dilation: int = 1, + depthwise: bool = False, + bias_enabled: bool = True, + output_padding: Optional[int] = None, + ) -> None: + groups = in_channels if depthwise else 1 + builder = GraphBuilder() + x_tensor = torch.randn(*shape, dtype=torch.float32) + x = builder.placeholder("x", x_tensor) + weights_tensor = torch.randn( + [out_channels, in_channels // groups, kernel], dtype=torch.float32 + ) + weights = builder.placeholder("weights", weights_tensor) + bias: Optional[ProxyValue] = None + bias_tensor: Optional[torch.Tensor] = None + if bias_enabled: + bias_tensor = torch.randn([out_channels], dtype=torch.float32) + bias = builder.placeholder("bias", bias_tensor) + convolution = builder.call_operator( + op=exir_ops.edge.aten.convolution.default, + args=( + x, + weights, + bias, + [stride], + [padding], + [dilation], + False, + [output_padding] if output_padding else [0], + groups, + ), + ) + builder.output([convolution]) + original_gm = builder.get_graph_module() + + replacement_pass_result = ( + ReplaceAtenConvolutionWithCadenceConvolutionPass().call(original_gm) + ) + self.assertIsNotNone(replacement_pass_result) + graph_after_passes = replacement_pass_result.graph_module + + self.assertEqual( + count_node(graph_after_passes, exir_ops.edge.aten.convolution.default), + 0, + ) + self.assertEqual( + count_node(graph_after_passes, exir_ops.edge.cadence.convolution.default), + 1, + ) + self.assertEqual( + count_node( + graph_after_passes, exir_ops.edge.cadence.transposed_convolution.default + ), + 0, + ) + + inputs = (x.to_tensor(), weights.to_tensor()) + if bias is not None: + inputs += (bias.to_tensor(),) + self.assertTensorMetadataIsSame( + pytree.tree_flatten(original_gm.forward(*inputs))[0], + 
pytree.tree_flatten(graph_after_passes.forward(*inputs))[0], + ) + + @expand( + [ + [(1, 8, 18), 8, 16, 3], + [(1, 8, 18), 8, 16, 5, 2], + # depthwise + bias + [(1, 8, 18), 8, 16, 5, 2, 0, 1, True, True], + # no bias + [(1, 8, 18), 8, 16, 3, 2, 4, 3, False, False], + # depthwise + no bias + [(1, 8, 18), 8, 16, 3, 1, 0, 1, True, False], + # bias + [(1, 8, 18), 8, 16, 5, 2, 0, 1, False, True], + ] + ) + @torch.no_grad() + def test_replace_aten_transposed_conv_with_cadence_transposed_conv( + self, + shape: Tuple[int, ...], + in_channels: int, + out_channels: int, + kernel: int, + stride: int = 1, + padding: int = 0, + dilation: int = 1, + depthwise: bool = False, + bias_enabled: bool = True, + output_padding: Optional[int] = None, + ) -> None: + groups = in_channels if depthwise else 1 + builder = GraphBuilder() + x = builder.placeholder("x", torch.randn(*shape, dtype=torch.float32)) + weights_shape = [in_channels, out_channels // groups, kernel] + weights = builder.placeholder( + "weights", + torch.randn(weights_shape, dtype=torch.float32), + ) + bias = ( + builder.placeholder( + "bias", torch.randn([out_channels], dtype=torch.float32) + ) + if bias_enabled + else None + ) + convolution = builder.call_operator( + op=exir_ops.edge.aten.convolution.default, + args=( + x, + weights, + bias, + [stride], + [padding], + [dilation], + True, + [output_padding] if output_padding else [0], + groups, + ), + ) + builder.output([convolution]) + original_gm = builder.get_graph_module() + + replacement_pass_result = ( + ReplaceAtenConvolutionWithCadenceConvolutionPass().call(original_gm) + ) + self.assertIsNotNone(replacement_pass_result) + graph_after_passes = replacement_pass_result.graph_module + + self.assertEqual( + count_node(graph_after_passes, exir_ops.edge.aten.convolution.default), + 0, + ) + self.assertEqual( + count_node(graph_after_passes, exir_ops.edge.cadence.convolution.default), + 0, + ) + self.assertEqual( + count_node( + graph_after_passes, 
exir_ops.edge.cadence.transposed_convolution.default + ), + 1, + ) + + inputs = (x.to_tensor(), weights.to_tensor()) + if bias is not None: + inputs += (bias.to_tensor(),) + self.assertTensorMetadataIsSame( + pytree.tree_flatten(original_gm.forward(*inputs))[0], + pytree.tree_flatten(graph_after_passes.forward(*inputs))[0], + ) + @expand( [ [(1, 8, 33), 8, 16, 3], @@ -455,8 +645,6 @@ def test_replace_convolution_optional_args_with_concrete_args( bias_enabled: bool = True, channel_last: bool = False, ) -> None: - transposed = True - output_padding = [0] groups = in_channels if depthwise else 1 builder = GraphBuilder() x = builder.placeholder("x", torch.randn(*shape, dtype=torch.float32)) @@ -477,7 +665,7 @@ def test_replace_convolution_optional_args_with_concrete_args( args=(x, [0, 2, 1]), ) convolution = builder.call_operator( - op=exir_ops.edge.aten.convolution.default, + op=exir_ops.edge.cadence.convolution.default, args=( x, weights, @@ -485,9 +673,8 @@ def test_replace_convolution_optional_args_with_concrete_args( [stride], [padding], [dilation], - transposed, - output_padding, groups, + False, ), ) if channel_last: @@ -504,7 +691,7 @@ def test_replace_convolution_optional_args_with_concrete_args( 1, ) self.assertEqual( - count_node(graph_after_passes, exir_ops.edge.aten.convolution.default), + count_node(graph_after_passes, exir_ops.edge.cadence.convolution.default), 1, ) @@ -1666,7 +1853,7 @@ def create_quantized_convolution_graph_module( out_multiplier, out_shift, ), - op=exir_ops.edge.cadence.quantized_conv_nhwc.default, + op=exir_ops.edge.cadence.quantized_conv2d_nhwc.default, args=args, ) else: @@ -1680,7 +1867,7 @@ def create_quantized_convolution_graph_module( out_multiplier, out_shift, ), - op=exir_ops.edge.cadence.quantized_conv_nchw.default, + op=exir_ops.edge.cadence.quantized_conv2d_nchw.default, args=args, ) @@ -1688,7 +1875,7 @@ def test_quantized_convolution_default_channel_last(self) -> None: # Create a graph with a single convolution node. 
gm = self.create_quantized_convolution_graph_module() self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_conv_nchw.default), 1 + count_node(gm, exir_ops.edge.cadence.quantized_conv2d_nchw.default), 1 ) self.assertEqual(count_node(gm, exir_ops.edge.aten.permute_copy.default), 0) @@ -1698,7 +1885,8 @@ def test_quantized_convolution_default_channel_last(self) -> None: # Check that no replacement was made. self.assertEqual( count_node( - gm_after_replacement, exir_ops.edge.cadence.quantized_conv_nhwc.default + gm_after_replacement, + exir_ops.edge.cadence.quantized_conv2d_nhwc.default, ), 1, ) @@ -1714,7 +1902,7 @@ def test_no_transpose_if_already_quantized_conv_channel_last(self) -> None: # Check if graph module is valid by running exportpass on it. gm = ExportPass().call(gm).graph_module self.assertEqual( - count_node(gm, exir_ops.edge.cadence.quantized_conv_nhwc.default), 1 + count_node(gm, exir_ops.edge.cadence.quantized_conv2d_nhwc.default), 1 ) # Apply replacement pass. @@ -1723,7 +1911,8 @@ def test_no_transpose_if_already_quantized_conv_channel_last(self) -> None: # Check that no replacement was made. 
self.assertEqual( count_node( - gm_after_replacement, exir_ops.edge.cadence.quantized_conv_nhwc.default + gm_after_replacement, + exir_ops.edge.cadence.quantized_conv2d_nhwc.default, ), 1, ) @@ -2081,3 +2270,48 @@ def test_replace_aten_linalg_svd_with_cadence_linalg_svd( count_node(graph_after_passes, exir_ops.edge.cadence.linalg_svd.default), 1, ) + + @expand([("dtype",), ("default",)]) + @torch.no_grad() + def test_replace_quantized_embedding( + self, + name: str, + ) -> None: + embedding = torch.ones(5, 6, dtype=torch.int8) + indices = torch.tensor([0, 2], dtype=torch.int32) + scales = torch.ones(5, 2, dtype=torch.float32) + zero_points = None + + original_gm = single_op_builder( + placeholders=(embedding, scales, indices), + op=( + exir_ops.edge.quantized_decomposed.embedding_byte.dtype + if name == "dtype" + else exir_ops.edge.quantized_decomposed.embedding_byte.default + ), + args=(embedding, scales, zero_points, -128, 127, indices), + kwargs={"dtype": torch.float32} if name == "dtype" else {}, + ) + + p = ReplaceTorchQuantizedEmbeddingWithCadenceQuantizedEmbedding() + graph_after_passes = cast(PassResult, p(original_gm)).graph_module + + self.assertEqual( + count_node( + graph_after_passes, + ( + exir_ops.edge.quantized_decomposed.embedding_byte.dtype + if name == "dtype" + else exir_ops.edge.quantized_decomposed.embedding_byte.default + ), + ), + 0, + ) + + self.assertEqual( + count_node( + graph_after_passes, + exir_ops.edge.cadence.quantized_embedding_byte.default, + ), + 1, + ) diff --git a/backends/cadence/aot/tests/test_type_dispatch_passes.py b/backends/cadence/aot/tests/test_type_dispatch_passes.py index 4ae10ea83dd..870735aad1a 100644 --- a/backends/cadence/aot/tests/test_type_dispatch_passes.py +++ b/backends/cadence/aot/tests/test_type_dispatch_passes.py @@ -199,29 +199,29 @@ def test_dispatch_quantized_matmul( "int8_nchw", torch.int8, (1, 3, 8, 8), # x_shape - exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - 
exir_ops.edge.cadence.quantized_conv_nchw_asym8sxsym8s_asym8s.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw_asym8sxsym8s_asym8s.per_tensor, ), ( "uint8_nchw", torch.uint8, (1, 3, 8, 8), # x_shape - exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - exir_ops.edge.cadence.quantized_conv_nchw_asym8uxsym8u_asym8u.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw_asym8uxsym8u_asym8u.per_tensor, ), ( "int8_nhwc", torch.int8, (1, 8, 8, 3), # x_shape - exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, - exir_ops.edge.cadence.quantized_conv_nhwc_asym8sxsym8s_asym8s.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc_asym8sxsym8s_asym8s.per_tensor, ), ( "uint8_nhwc", torch.uint8, (1, 8, 8, 3), # x_shape - exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, - exir_ops.edge.cadence.quantized_conv_nhwc_asym8uxsym8u_asym8u.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc_asym8uxsym8u_asym8u.per_tensor, ), ] ) @@ -256,29 +256,29 @@ def test_dispatch_quantized_conv_2d( "int8_nchw_dilated", torch.int8, (1, 3, 8, 8), # x_shape - exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - exir_ops.edge.cadence.quantized_conv_nchw_dilated_asym8sxsym8s_asym8s.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s.per_tensor, ), ( "uint8_nchw_dilated", torch.uint8, (1, 3, 8, 8), # x_shape - exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - exir_ops.edge.cadence.quantized_conv_nchw_dilated_asym8uxsym8u_asym8u.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u.per_tensor, ), ( "int8_nhwc_dilated", torch.int8, (1, 8, 8, 3), # x_shape - 
exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, - exir_ops.edge.cadence.quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s.per_tensor, ), ( "uint8_nhwc_dilated", torch.uint8, (1, 8, 8, 3), # x_shape - exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, - exir_ops.edge.cadence.quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor, ), ] ) @@ -313,29 +313,29 @@ def test_dispatch_quantized_conv_2d_dilated( "int8_nchw_1d", torch.int8, (1, 3, 8), # x_shape - exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - exir_ops.edge.cadence.quantized_conv1d_nchw_asym8sxsym8s_asym8s.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor, + exir_ops.edge.cadence.quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor, ), ( "uint8_nchw_1d", torch.uint8, (1, 3, 8), # x_shape - exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - exir_ops.edge.cadence.quantized_conv1d_nchw_asym8uxsym8u_asym8u.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor, + exir_ops.edge.cadence.quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor, ), ( "int8_nhwc_1d", torch.int8, (1, 8, 3), # x_shape - exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, - exir_ops.edge.cadence.quantized_conv1d_nhwc_asym8sxsym8s_asym8s.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc.per_tensor, + exir_ops.edge.cadence.quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor, ), ( "uint8_nhwc_1d", torch.uint8, (1, 8, 3), # x_shape - exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, - exir_ops.edge.cadence.quantized_conv1d_nhwc_asym8uxsym8u_asym8u.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc.per_tensor, + exir_ops.edge.cadence.quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor, ), ] 
) @@ -410,32 +410,32 @@ def test_dispatch_quantized_add( torch.int8, (1, 3, 8, 8), # x_shape (3, 1, 3, 3), # w_shape (groups=3, input_channels=3) - exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - exir_ops.edge.cadence.quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s.per_tensor, ), ( "uint8_nchw_depthwise", torch.uint8, (1, 3, 8, 8), # x_shape (3, 1, 3, 3), # w_shape (groups=3, input_channels=3) - exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - exir_ops.edge.cadence.quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u.per_tensor, ), ( "int8_nhwc_depthwise", torch.int8, (1, 8, 8, 3), # x_shape (3, 3, 3, 1), # w_shape (groups=3, input_channels=3) - exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, - exir_ops.edge.cadence.quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s.per_tensor, ), ( "uint8_nhwc_depthwise", torch.uint8, (1, 8, 8, 3), # x_shape (3, 3, 3, 1), # w_shape (groups=3, input_channels=3) - exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, - exir_ops.edge.cadence.quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u.per_tensor, ), ] ) diff --git a/backends/cadence/aot/type_dispatch.py b/backends/cadence/aot/type_dispatch.py index 958a78a4808..37f753767e9 100644 --- a/backends/cadence/aot/type_dispatch.py +++ b/backends/cadence/aot/type_dispatch.py @@ -27,6 +27,7 @@ class OpConfig: base_name: str type_dispatch_suffixes: dict[tuple[torch.dtype, ...], str] 
weight_arg_idx: Optional[int] = None + is_quant_op: bool = False variant: str = "per_tensor" @@ -62,16 +63,16 @@ class CompileTimeTypeDispatchPass(ExportPass): weight_arg_idx=2, variant="default", ), - exir_ops.edge.cadence.quantized_conv_nchw.per_tensor: OpConfig( - "quantized_conv_nchw", + exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor: OpConfig( + "quantized_conv2d_nchw", type_dispatch_suffixes={ (torch.int8, torch.int8): "asym8sxsym8s_asym8s", (torch.uint8, torch.uint8): "asym8uxsym8u_asym8u", }, weight_arg_idx=1, ), - exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor: OpConfig( - "quantized_conv_nhwc", + exir_ops.edge.cadence.quantized_conv2d_nhwc.per_tensor: OpConfig( + "quantized_conv2d_nhwc", type_dispatch_suffixes={ (torch.int8, torch.int8): "asym8sxsym8s_asym8s", (torch.uint8, torch.uint8): "asym8uxsym8u_asym8u", @@ -100,6 +101,29 @@ class CompileTimeTypeDispatchPass(ExportPass): }, variant="default", ), + exir_ops.edge.cadence.quantize_per_tensor.default: OpConfig( + "quantize_per_tensor", + type_dispatch_suffixes={ + (torch.int8,): "asym8s", + (torch.uint8,): "asym8u", + (torch.int16,): "asym16s", + (torch.uint16,): "asym16s", + (torch.int32,): "asym32s", + }, + variant="default", + is_quant_op=True, + ), + exir_ops.edge.cadence.dequantize_per_tensor.default: OpConfig( + "dequantize_per_tensor", + type_dispatch_suffixes={ + (torch.int8,): "asym8s", + (torch.uint8,): "asym8u", + (torch.int16,): "asym16s", + (torch.uint16,): "asym16s", + (torch.int32,): "asym32s", + }, + variant="default", + ), } def call_operator( @@ -120,6 +144,8 @@ def call_operator( if config.weight_arg_idx is not None: weight_dtype = args[config.weight_arg_idx].to_tensor().dtype dtype_key = (input_dtype, weight_dtype) + elif config.is_quant_op: + dtype_key = (args[5],) else: dtype_key = (input_dtype,) @@ -132,13 +158,13 @@ def call_operator( typed_op_name = f"{base_name}_{type_suffix}" if op in [ - exir_ops.edge.cadence.quantized_conv_nchw.per_tensor, - 
exir_ops.edge.cadence.quantized_conv_nhwc.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor, + exir_ops.edge.cadence.quantized_conv2d_nhwc.per_tensor, ]: groups = args[6] input_channels = ( args[0].to_tensor().shape[1] - if op == exir_ops.edge.cadence.quantized_conv_nchw.per_tensor + if op == exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor else args[0].to_tensor().shape[-1] ) is_depthwise = groups == input_channels @@ -151,9 +177,11 @@ def call_operator( elif is_dilated: typed_op_name = f"{base_name}_dilated_{type_suffix}" elif is_1d and groups == 1: - typed_op_name = ( - f"quantized_conv1d_{base_name.split('_')[-1]}_{type_suffix}" - ) + if "nchw" in base_name: + layout_suffix = "ncl" + else: + layout_suffix = "nlc" + typed_op_name = f"quantized_conv1d_{layout_suffix}_{type_suffix}" typed_op = getattr( getattr(exir_ops.edge.cadence, typed_op_name), config.variant diff --git a/backends/cadence/build_cadence_vision.sh b/backends/cadence/build_cadence_vision.sh new file mode 100755 index 00000000000..7c2c6d68860 --- /dev/null +++ b/backends/cadence/build_cadence_vision.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +set -euo pipefail + +unset CMAKE_PREFIX_PATH +unset XTENSA_CORE +export XTENSA_CORE=XRC_Vision_130_AO +git submodule sync +git submodule update --init --recursive +./install_requirements.sh +./install_executorch.sh + +rm -rf cmake-out + +STEPWISE_BUILD=false + +if $STEPWISE_BUILD; then + echo "Building ExecuTorch" + CXXFLAGS="-fno-exceptions -fno-rtti" cmake -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_ENABLE_EVENT_TRACER=OFF \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ + -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \ + -DEXECUTORCH_BUILD_CPUINFO=OFF \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DEXECUTORCH_USE_DL=OFF \ + -DEXECUTORCH_BUILD_CADENCE=OFF \ + -Bcmake-out . + + echo "Building any Cadence-specific binaries on top" + CXXFLAGS="-fno-exceptions -fno-rtti" cmake -DBUCK2="$BUCK" \ + -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ + -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \ + -DEXECUTORCH_BUILD_CADENCE=ON \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DEXECUTORCH_ENABLE_PROGRAM_VERIFICATION=ON \ + -DEXECUTORCH_USE_DL=OFF \ + -DEXECUTORCH_BUILD_PORTABLE_OPS=ON \ + -DEXECUTORCH_BUILD_KERNELS_LLM=OFF \ + -DPYTHON_EXECUTABLE=python3 \ + -DEXECUTORCH_VISION_OPT=ON \ + -DHAVE_FNMATCH_H=OFF \ + -Bcmake-out/backends/cadence \ + backends/cadence + cmake --build cmake-out/backends/cadence -j8 +else + echo "Building Cadence toolchain with ExecuTorch packages" + cmake_prefix_path="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags" + CXXFLAGS="-fno-exceptions -fno-rtti" cmake -DBUCK2="$BUCK" \ + -DCMAKE_PREFIX_PATH="${cmake_prefix_path}" \ + -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ + 
-DEXECUTORCH_BUILD_PTHREADPOOL=OFF \ + -DEXECUTORCH_BUILD_CPUINFO=OFF \ + -DEXECUTORCH_BUILD_CADENCE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DEXECUTORCH_ENABLE_PROGRAM_VERIFICATION=ON \ + -DEXECUTORCH_USE_DL=OFF \ + -DEXECUTORCH_BUILD_PORTABLE_OPS=ON \ + -DEXECUTORCH_BUILD_KERNELS_LLM=OFF \ + -DPYTHON_EXECUTABLE=python3 \ + -DEXECUTORCH_VISION_OPT=ON \ + -DHAVE_FNMATCH_H=OFF \ + -Bcmake-out + cmake --build cmake-out --target install --config Release -j8 +fi + +echo "Run simple model to verify cmake build" +python3 -m examples.portable.scripts.export --model_name="add" +xt-run --turbo cmake-out/executor_runner --model_path=add.pte diff --git a/backends/cadence/generic/kernels/kernels.cpp b/backends/cadence/generic/kernels/kernels.cpp index 568d8468af9..25e25cfa60a 100644 --- a/backends/cadence/generic/kernels/kernels.cpp +++ b/backends/cadence/generic/kernels/kernels.cpp @@ -73,6 +73,7 @@ typed_quantize_val(int8_t); typed_quantize_val(uint8_t); typed_quantize_val(int16_t); typed_quantize_val(uint16_t); +typed_quantize_val(int32_t); #undef typed_quantize_val #define typed_quantize_vec(dtype) \ @@ -86,6 +87,7 @@ typed_quantize_vec(int8_t); typed_quantize_vec(uint8_t); typed_quantize_vec(int16_t); typed_quantize_vec(uint16_t); +typed_quantize_vec(int32_t); #undef typed_quantize_vec #define typed_dequantize_val(dtype) \ @@ -94,6 +96,7 @@ typed_dequantize_val(int8_t); typed_dequantize_val(uint8_t); typed_dequantize_val(int16_t); typed_dequantize_val(uint16_t); +typed_dequantize_val(int32_t); #undef typed_dequantize_val #define typed_dequantize_vec(dtype) \ @@ -107,6 +110,7 @@ typed_dequantize_vec(int8_t); typed_dequantize_vec(uint8_t); typed_dequantize_vec(int16_t); typed_dequantize_vec(uint16_t); +typed_dequantize_vec(int32_t); #undef typed_dequantize_vec } // namespace kernels diff --git a/backends/cadence/generic/operators/CMakeLists.txt b/backends/cadence/generic/operators/CMakeLists.txt index 
ea5b699f441..63d8902ac89 100644 --- a/backends/cadence/generic/operators/CMakeLists.txt +++ b/backends/cadence/generic/operators/CMakeLists.txt @@ -16,10 +16,6 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake) # ATen compliant ops that are needed to run this model. set(_aten_ops__srcs - "${CMAKE_CURRENT_SOURCE_DIR}/op_add.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/op_embedding.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/op_full.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/op_view_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/activation_ops_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/copy_ops_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/broadcast_util.cpp" @@ -31,10 +27,13 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/slice_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_add.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_div.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_embedding.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_full.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_hardtanh.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_max_pool2d_with_indices.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_mean.cpp" @@ -58,6 +57,7 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_native_group_norm.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sum.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_select_copy.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_view_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/dtype_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/normalization_ops_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/select_copy_util.cpp" @@ 
-80,15 +80,15 @@ target_include_directories( add_library( custom_ops "quantized_linear_out.cpp" - "quantized_conv_nchw_out.cpp" - "quantized_conv_nhwc_out.cpp" + "quantized_conv2d_nchw_out.cpp" + "quantized_conv2d_nhwc_out.cpp" "quantized_relu_out.cpp" "quantized_layer_norm.cpp" "quantize_per_tensor.cpp" "quantized_fully_connected_out.cpp" "dequantize_per_tensor.cpp" "quantized_matmul_out.cpp" - "requantize_out.cpp" + "op_requantize_out.cpp" "im2row_out.cpp" ) target_include_directories( diff --git a/backends/cadence/generic/operators/dequantize_per_tensor.cpp b/backends/cadence/generic/operators/dequantize_per_tensor.cpp index 1481981ee0b..ec05272da1b 100644 --- a/backends/cadence/generic/operators/dequantize_per_tensor.cpp +++ b/backends/cadence/generic/operators/dequantize_per_tensor.cpp @@ -18,7 +18,7 @@ using ::executorch::aten::Tensor; using ::executorch::runtime::KernelRuntimeContext; using ::impl::generic::kernels::dequantize; -void dequantize_per_tensor_out( +Tensor& dequantize_per_tensor_out( KernelRuntimeContext& context, const Tensor& input, double scale, @@ -44,12 +44,96 @@ void dequantize_per_tensor_out( } else if (input.scalar_type() == ScalarType::Short) { const int16_t* input_data = input.const_data_ptr(); dequantize(out_data, input_data, scale, zero_point, numel); + } else if (input.scalar_type() == ScalarType::Int) { + const int32_t* input_data = input.const_data_ptr(); + dequantize(out_data, input_data, scale, zero_point, numel); } else { ET_CHECK_MSG( false, "Unhandled input dtype %hhd", static_cast(input.scalar_type())); } + return out; +} + +Tensor& dequantize_per_tensor_asym8s_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + float* out_data = out.mutable_data_ptr(); + size_t numel = out.numel(); + const int8_t* input_data = input.const_data_ptr(); + dequantize(out_data, input_data, scale, zero_point, numel); + 
return out; +} + +Tensor& dequantize_per_tensor_asym8u_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + float* out_data = out.mutable_data_ptr(); + size_t numel = out.numel(); + const uint8_t* input_data = input.const_data_ptr(); + dequantize(out_data, input_data, scale, zero_point, numel); + return out; +} + +Tensor& dequantize_per_tensor_asym16s_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + float* out_data = out.mutable_data_ptr(); + size_t numel = out.numel(); + const int16_t* input_data = input.const_data_ptr(); + dequantize(out_data, input_data, scale, zero_point, numel); + return out; +} + +Tensor& dequantize_per_tensor_asym16u_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + float* out_data = out.mutable_data_ptr(); + size_t numel = out.numel(); + const uint16_t* input_data = input.const_data_ptr(); + dequantize(out_data, input_data, scale, zero_point, numel); + return out; +} + +Tensor& dequantize_per_tensor_asym32s_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + float* out_data = out.mutable_data_ptr(); + size_t numel = out.numel(); + const int32_t* input_data = input.const_data_ptr(); + dequantize(out_data, input_data, scale, zero_point, numel); + return out; } } // namespace native diff --git a/backends/cadence/generic/operators/quantize_per_tensor.cpp b/backends/cadence/generic/operators/quantize_per_tensor.cpp index 29b233dab09..8ce70d2b51d 100644 --- a/backends/cadence/generic/operators/quantize_per_tensor.cpp +++ 
b/backends/cadence/generic/operators/quantize_per_tensor.cpp @@ -20,7 +20,7 @@ using ::impl::generic::kernels::quantize; // Quantize the input tensor (PT2 version). Note that quant_ are not // used in any computation. -void quantize_per_tensor_out( +Tensor& quantize_per_tensor_out( KernelRuntimeContext& context, const Tensor& input, double scale, @@ -34,30 +34,110 @@ void quantize_per_tensor_out( if (out.scalar_type() == ScalarType::Byte) { uint8_t* out_data = out.mutable_data_ptr(); - impl::generic::kernels::quantize( - out_data, input_data, 1. / scale, zero_point, numel); + quantize(out_data, input_data, 1. / scale, zero_point, numel); } else if (out.scalar_type() == ScalarType::Char) { int8_t* out_data = out.mutable_data_ptr(); - impl::generic::kernels::quantize( - out_data, input_data, 1. / scale, zero_point, numel); + quantize(out_data, input_data, 1. / scale, zero_point, numel); } else if ( out.scalar_type() == ScalarType::Bits16 || out.scalar_type() == ScalarType::UInt16) { uint16_t* out_data = out.mutable_data_ptr(); - impl::generic::kernels::quantize( - out_data, input_data, 1. / scale, zero_point, numel); + quantize(out_data, input_data, 1. / scale, zero_point, numel); } else if (out.scalar_type() == ScalarType::Short) { int16_t* out_data = out.mutable_data_ptr(); - impl::generic::kernels::quantize( - out_data, input_data, 1. / scale, zero_point, numel); + quantize(out_data, input_data, 1. / scale, zero_point, numel); + } else if (out.scalar_type() == ScalarType::Int) { + int32_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. 
/ scale, zero_point, numel); } else { ET_CHECK_MSG( false, "Unhandled input dtype %hhd", static_cast(out.scalar_type())); } + return out; } -} // namespace native -} // namespace generic -} // namespace impl +Tensor& quantize_per_tensor_asym8s_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + const float* input_data = input.const_data_ptr(); + size_t numel = out.numel(); + int8_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. / scale, zero_point, numel); + return out; +} + +Tensor& quantize_per_tensor_asym8u_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + const float* input_data = input.const_data_ptr(); + size_t numel = out.numel(); + uint8_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. / scale, zero_point, numel); + return out; +} + +Tensor& quantize_per_tensor_asym16s_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + const float* input_data = input.const_data_ptr(); + size_t numel = out.numel(); + int16_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. / scale, zero_point, numel); + return out; +} + +Tensor& quantize_per_tensor_asym16u_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + const float* input_data = input.const_data_ptr(); + size_t numel = out.numel(); + uint16_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. 
/ scale, zero_point, numel); + return out; +} + +Tensor& quantize_per_tensor_asym32s_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + const float* input_data = input.const_data_ptr(); + size_t numel = out.numel(); + int32_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. / scale, zero_point, numel); + return out; +} + +}; // namespace native +}; // namespace generic +}; // namespace impl diff --git a/backends/cadence/generic/operators/quantized_conv_nchw_out.cpp b/backends/cadence/generic/operators/quantized_conv2d_nchw_out.cpp similarity index 94% rename from backends/cadence/generic/operators/quantized_conv_nchw_out.cpp rename to backends/cadence/generic/operators/quantized_conv2d_nchw_out.cpp index 6eeabcf1d52..fbb01c82e65 100644 --- a/backends/cadence/generic/operators/quantized_conv_nchw_out.cpp +++ b/backends/cadence/generic/operators/quantized_conv2d_nchw_out.cpp @@ -157,7 +157,7 @@ __attribute__((noinline)) void conv2d_nchw_core_generic( // bias_scale, since it is a product of the two. The kernel will branch to // quantized::conv1d or quantized::conv2d based on the dimensionality of // activation tensor. 
-void quantized_conv_nchw( +void quantized_conv2d_nchw( const Tensor& input, const Tensor& weight, const Tensor& bias, @@ -228,7 +228,7 @@ void quantized_conv_nchw( #undef typed_quantized_conv2d_nchw } -void quantized_conv_nchw_out( +void quantized_conv2d_nchw_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -248,7 +248,7 @@ void quantized_conv_nchw_out( const float bias_scale_float = bias_scale.const_data_ptr()[0]; const int32_t weight_zero_point_int = weight_zero_point.const_data_ptr()[0]; - quantized_conv_nchw( + quantized_conv2d_nchw( input, weight, bias, @@ -264,7 +264,7 @@ void quantized_conv_nchw_out( out); } -void quantized_conv_nchw_per_tensor_out( +void quantized_conv2d_nchw_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -282,7 +282,7 @@ void quantized_conv_nchw_per_tensor_out( __ET_UNUSED int64_t out_shift, bool channel_last, Tensor& out) { - quantized_conv_nchw( + quantized_conv2d_nchw( input, weight, bias, @@ -298,7 +298,7 @@ void quantized_conv_nchw_per_tensor_out( out); } -void quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -315,7 +315,7 @@ void quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nchw( + quantized_conv2d_nchw( input, weight, bias, @@ -331,7 +331,7 @@ void quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_out( out); } -void quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv2d_nchw_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -348,7 +348,7 @@ void quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - 
quantized_conv_nchw( + quantized_conv2d_nchw( input, weight, bias, @@ -364,7 +364,7 @@ void quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_out( out); } -void quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -381,7 +381,7 @@ void quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nchw( + quantized_conv2d_nchw( input, weight, bias, @@ -397,7 +397,7 @@ void quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out( out); } -void quantized_conv_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -414,7 +414,7 @@ void quantized_conv_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nchw( + quantized_conv2d_nchw( input, weight, bias, @@ -430,7 +430,7 @@ void quantized_conv_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out( out); } -void quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -447,7 +447,7 @@ void quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nchw( + quantized_conv2d_nchw( input, weight, bias, @@ -463,7 +463,7 @@ void quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out( out); } -void quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out( 
__ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -480,7 +480,7 @@ void quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nchw( + quantized_conv2d_nchw( input, weight, bias, @@ -496,7 +496,7 @@ void quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out( out); } -void quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -513,7 +513,7 @@ void quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nchw( + quantized_conv2d_nchw( input, weight, bias, @@ -529,7 +529,7 @@ void quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out( out); } -void quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -546,7 +546,7 @@ void quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nchw( + quantized_conv2d_nchw( input, weight, bias, diff --git a/backends/cadence/generic/operators/quantized_conv_nhwc_out.cpp b/backends/cadence/generic/operators/quantized_conv2d_nhwc_out.cpp similarity index 94% rename from backends/cadence/generic/operators/quantized_conv_nhwc_out.cpp rename to backends/cadence/generic/operators/quantized_conv2d_nhwc_out.cpp index d377048b142..eca836dcc94 100644 --- a/backends/cadence/generic/operators/quantized_conv_nhwc_out.cpp +++ b/backends/cadence/generic/operators/quantized_conv2d_nhwc_out.cpp @@ -144,7 +144,7 @@ __attribute__((noinline)) void conv2d_nhwc_core_generic( } } -void quantized_conv_nhwc( 
+void quantized_conv2d_nhwc( const Tensor& input, const Tensor& weight, const Tensor& bias, @@ -215,7 +215,7 @@ void quantized_conv_nhwc( #undef typed_quantized_conv2d_nhwc } -void quantized_conv_nhwc_out( +void quantized_conv2d_nhwc_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -235,7 +235,7 @@ void quantized_conv_nhwc_out( const float bias_scale_float = bias_scale.const_data_ptr()[0]; const int32_t weight_zero_point_int = weight_zero_point.const_data_ptr()[0]; - quantized_conv_nhwc( + quantized_conv2d_nhwc( input, weight, bias, @@ -251,7 +251,7 @@ void quantized_conv_nhwc_out( out); } -void quantized_conv_nhwc_per_tensor_out( +void quantized_conv2d_nhwc_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -269,7 +269,7 @@ void quantized_conv_nhwc_per_tensor_out( __ET_UNUSED int64_t out_shift, bool channel_last, Tensor& out) { - quantized_conv_nhwc( + quantized_conv2d_nhwc( input, weight, bias, @@ -285,7 +285,7 @@ void quantized_conv_nhwc_per_tensor_out( out); } -void quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv2d_nhwc_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -302,7 +302,7 @@ void quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nhwc( + quantized_conv2d_nhwc( input, weight, bias, @@ -318,7 +318,7 @@ void quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_out( out); } -void quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv2d_nhwc_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -335,7 +335,7 @@ void quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nhwc( + 
quantized_conv2d_nhwc( input, weight, bias, @@ -351,7 +351,7 @@ void quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out( out); } -void quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -368,7 +368,7 @@ void quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nhwc( + quantized_conv2d_nhwc( input, weight, bias, @@ -384,7 +384,7 @@ void quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out( out); } -void quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -401,7 +401,7 @@ void quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nhwc( + quantized_conv2d_nhwc( input, weight, bias, @@ -417,7 +417,7 @@ void quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out( out); } -void quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -434,7 +434,7 @@ void quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nhwc( + quantized_conv2d_nhwc( input, weight, bias, @@ -450,7 +450,7 @@ void quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out( out); } -void quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, 
@@ -467,7 +467,7 @@ void quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nhwc( + quantized_conv2d_nhwc( input, weight, bias, @@ -483,7 +483,7 @@ void quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out( out); } -void quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -500,7 +500,7 @@ void quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nhwc( + quantized_conv2d_nhwc( input, weight, bias, @@ -516,7 +516,7 @@ void quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out( out); } -void quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -533,7 +533,7 @@ void quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - quantized_conv_nhwc( + quantized_conv2d_nhwc( input, weight, bias, diff --git a/backends/cadence/generic/operators/targets.bzl b/backends/cadence/generic/operators/targets.bzl index 4ff821158bc..fa0f128b229 100644 --- a/backends/cadence/generic/operators/targets.bzl +++ b/backends/cadence/generic/operators/targets.bzl @@ -4,64 +4,6 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") def define_common_targets(): # Individual operator targets with optimized dependencies - # Basic operators (need broadcast_util and scalar_utils) - runtime.cxx_library( - name = "op_add", - srcs = ["op_add.cpp"], - platforms = CXX, - deps = [ - 
"//executorch/kernels/portable/cpu/util:broadcast_util", - "//executorch/runtime/kernel:kernel_includes", - "//executorch/kernels/portable/cpu:scalar_utils", - ], - visibility = [ - "//executorch/backends/cadence/...", - "@EXECUTORCH_CLIENTS", - ], - ) - - runtime.cxx_library( - name = "op_full", - srcs = ["op_full.cpp"], - platforms = CXX, - deps = [ - "//executorch/runtime/kernel:kernel_includes", - "//executorch/kernels/portable/cpu:scalar_utils", - ], - visibility = [ - "//executorch/backends/cadence/...", - "@EXECUTORCH_CLIENTS", - ], - ) - - # Simple operators (only need kernel_includes) - runtime.cxx_library( - name = "op_embedding", - srcs = ["op_embedding.cpp"], - platforms = CXX, - deps = [ - "//executorch/runtime/kernel:kernel_includes", - ], - visibility = [ - "//executorch/backends/cadence/...", - "@EXECUTORCH_CLIENTS", - ], - ) - - runtime.cxx_library( - name = "op_view_copy", - srcs = ["op_view_copy.cpp"], - platforms = CXX, - deps = [ - "//executorch/runtime/kernel:kernel_includes", - ], - visibility = [ - "//executorch/backends/cadence/...", - "@EXECUTORCH_CLIENTS", - ], - ) - - # Operators that need the operators.h header and basic runtime runtime.cxx_library( name = "im2row_out", srcs = ["im2row_out.cpp"], @@ -102,6 +44,7 @@ def define_common_targets(): ], visibility = [ "//executorch/backends/cadence/...", + "@EXECUTORCH_CLIENTS", ], ) @@ -136,8 +79,8 @@ def define_common_targets(): ) runtime.cxx_library( - name = "quantized_conv_nchw_out", - srcs = ["quantized_conv_nchw_out.cpp"], + name = "quantized_conv2d_nchw_out", + srcs = ["quantized_conv2d_nchw_out.cpp"], exported_headers = ["operators.h", "quantized_ops.h"], platforms = CXX, deps = [ @@ -151,8 +94,8 @@ def define_common_targets(): ) runtime.cxx_library( - name = "quantized_conv_nhwc_out", - srcs = ["quantized_conv_nhwc_out.cpp"], + name = "quantized_conv2d_nhwc_out", + srcs = ["quantized_conv2d_nhwc_out.cpp"], exported_headers = ["operators.h", "quantized_ops.h"], platforms = CXX, deps = 
[ diff --git a/backends/cadence/hifi/kernels/kernels.cpp b/backends/cadence/hifi/kernels/kernels.cpp index d9223d7bd18..237c605443f 100644 --- a/backends/cadence/hifi/kernels/kernels.cpp +++ b/backends/cadence/hifi/kernels/kernels.cpp @@ -127,6 +127,7 @@ typed_quantize_val(int8_t); typed_quantize_val(uint8_t); typed_quantize_val(int16_t); typed_quantize_val(uint16_t); +typed_quantize_val(int32_t); #undef typed_quantize_val #define typed_quantize_vec(dtype) \ @@ -150,6 +151,7 @@ typed_dequantize_val(int8_t); typed_dequantize_val(uint8_t); typed_dequantize_val(int16_t); typed_dequantize_val(uint16_t); +typed_dequantize_val(int32_t); #undef typed_dequantize_val #define typed_dequantize_vec(dtype) \ diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index 6bd63c6d9f6..26555da9760 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -96,8 +96,8 @@ add_library( "op_quantize_per_tensor.cpp" "op_quantized_relu_out.cpp" "op_dequantize_per_tensor.cpp" - "op_quantized_conv_nchw_out.cpp" - "op_quantized_conv_nhwc_out.cpp" + "op_quantized_conv2d_nchw_out.cpp" + "op_quantized_conv2d_nhwc_out.cpp" "op_quantized_fully_connected_out" ) target_include_directories( diff --git a/backends/cadence/hifi/operators/op_dequantize_per_tensor.cpp b/backends/cadence/hifi/operators/op_dequantize_per_tensor.cpp index f416082b10f..30ce938e24d 100644 --- a/backends/cadence/hifi/operators/op_dequantize_per_tensor.cpp +++ b/backends/cadence/hifi/operators/op_dequantize_per_tensor.cpp @@ -45,6 +45,9 @@ void dequantize_per_tensor_out( input.scalar_type() == ScalarType::UInt16) { const uint16_t* input_data = input.const_data_ptr(); dequantize(out_data, input_data, scale, zero_point, numel); + } else if (input.scalar_type() == ScalarType::Int) { + const int32_t* input_data = input.const_data_ptr(); + dequantize(out_data, input_data, scale, zero_point, numel); } else { ET_CHECK_MSG( 
false, @@ -53,6 +56,66 @@ void dequantize_per_tensor_out( } } +void dequantize_per_tensor_asym8u_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + float* out_data = out.mutable_data_ptr(); + size_t numel = out.numel(); + const uint8_t* input_data = input.const_data_ptr(); + dequantize(out_data, input_data, scale, zero_point, numel); +} + +void dequantize_per_tensor_asym16s_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + float* out_data = out.mutable_data_ptr(); + size_t numel = out.numel(); + const int16_t* input_data = input.const_data_ptr(); + dequantize(out_data, input_data, scale, zero_point, numel); +} + +void dequantize_per_tensor_asym16u_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + float* out_data = out.mutable_data_ptr(); + size_t numel = out.numel(); + const uint16_t* input_data = input.const_data_ptr(); + dequantize(out_data, input_data, scale, zero_point, numel); +} + +void dequantize_per_tensor_asym32s_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + float* out_data = out.mutable_data_ptr(); + size_t numel = out.numel(); + const int32_t* input_data = input.const_data_ptr(); + dequantize(out_data, input_data, scale, zero_point, numel); +} + } // namespace native } // namespace HiFi } // namespace impl diff --git a/backends/cadence/hifi/operators/op_dequantize_per_tensor_asym8s.cpp b/backends/cadence/hifi/operators/op_dequantize_per_tensor_asym8s.cpp new file mode 100644 index 00000000000..d1099b1a4db --- /dev/null +++ 
b/backends/cadence/hifi/operators/op_dequantize_per_tensor_asym8s.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include + +namespace impl { +namespace HiFi { +namespace native { + +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::KernelRuntimeContext; + +void dequantize_per_tensor_asym8s_out( + KernelRuntimeContext& ctx, + const Tensor& input, + double scale, + int64_t zero_point, + __ET_UNUSED int64_t quant_min, + __ET_UNUSED int64_t quant_max, + ScalarType dtype, + Tensor& out) { + float* out_data = out.mutable_data_ptr(); + const size_t numel = out.numel(); + const int8_t* input_data = input.const_data_ptr(); + xa_nn_elm_dequantize_asym8s_f32( + out_data, input_data, zero_point, scale, numel); +} + +}; // namespace native +}; // namespace HiFi +}; // namespace impl diff --git a/backends/cadence/hifi/operators/op_quantize_per_tensor.cpp b/backends/cadence/hifi/operators/op_quantize_per_tensor.cpp index b2f47619f05..579a4533057 100644 --- a/backends/cadence/hifi/operators/op_quantize_per_tensor.cpp +++ b/backends/cadence/hifi/operators/op_quantize_per_tensor.cpp @@ -19,10 +19,13 @@ namespace impl { namespace HiFi { namespace native { + namespace { + using ::executorch::aten::ScalarType; using ::executorch::aten::Tensor; using ::executorch::runtime::KernelRuntimeContext; +using ::impl::HiFi::kernels::quantize; // Add checks for dtype quant min/max bounds. template @@ -92,22 +95,22 @@ void quantize_per_tensor_out( const size_t numel = out.numel(); if (out.scalar_type() == ScalarType::Byte) { uint8_t* out_data = out.mutable_data_ptr(); - impl::HiFi::kernels::quantize( - out_data, input_data, 1. / scale, zero_point, numel); + quantize(out_data, input_data, 1. 
/ scale, zero_point, numel); } else if (out.scalar_type() == ScalarType::Char) { int8_t* out_data = out.mutable_data_ptr(); xa_nn_elm_quantize_f32_asym8s( out_data, input_data, scale, zero_point, numel); } else if (out.scalar_type() == ScalarType::Short) { int16_t* out_data = out.mutable_data_ptr(); - impl::HiFi::kernels::quantize( - out_data, input_data, 1. / scale, zero_point, numel); + quantize(out_data, input_data, 1. / scale, zero_point, numel); } else if ( out.scalar_type() == ScalarType::Bits16 || out.scalar_type() == ScalarType::UInt16) { uint16_t* out_data = out.mutable_data_ptr(); - impl::HiFi::kernels::quantize( - out_data, input_data, 1. / scale, zero_point, numel); + quantize(out_data, input_data, 1. / scale, zero_point, numel); + } else if (out.scalar_type() == ScalarType::Int) { + int32_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. / scale, zero_point, numel); } else { ET_KERNEL_CHECK_MSG( ctx, @@ -119,6 +122,66 @@ void quantize_per_tensor_out( } } -} // namespace native -} // namespace HiFi -} // namespace impl +void quantize_per_tensor_asym8u_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + const float* input_data = input.const_data_ptr(); + size_t numel = out.numel(); + uint8_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. / scale, zero_point, numel); +} + +void quantize_per_tensor_asym16s_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + const float* input_data = input.const_data_ptr(); + size_t numel = out.numel(); + int16_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. 
/ scale, zero_point, numel); +} + +void quantize_per_tensor_asym16u_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + const float* input_data = input.const_data_ptr(); + size_t numel = out.numel(); + uint16_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. / scale, zero_point, numel); +} + +void quantize_per_tensor_asym32s_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + const float* input_data = input.const_data_ptr(); + size_t numel = out.numel(); + int32_t* out_data = out.mutable_data_ptr(); + quantize(out_data, input_data, 1. / scale, zero_point, numel); +} + +}; // namespace native +}; // namespace HiFi +}; // namespace impl diff --git a/backends/cadence/hifi/operators/op_quantize_per_tensor_asym8s.cpp b/backends/cadence/hifi/operators/op_quantize_per_tensor_asym8s.cpp new file mode 100644 index 00000000000..552b6acf150 --- /dev/null +++ b/backends/cadence/hifi/operators/op_quantize_per_tensor_asym8s.cpp @@ -0,0 +1,44 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +#include +#include +#include +#include +#include + +namespace impl { +namespace HiFi { +namespace native { + +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::KernelRuntimeContext; + +void quantize_per_tensor_asym8s_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + const float* input_data = input.const_data_ptr(); + size_t numel = out.numel(); + int8_t* out_data = out.mutable_data_ptr(); + xa_nn_elm_quantize_f32_asym8s(out_data, input_data, scale, zero_point, numel); +} + +} // namespace native +} // namespace HiFi +} // namespace impl diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out.cpp similarity index 96% rename from backends/cadence/hifi/operators/op_quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out.cpp index 566325e0f10..b5ab0cdbaa2 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out.cpp @@ -22,7 +22,7 @@ namespace HiFi { namespace native { // Optimized NCHW 1D convolution for int8 x int8 -> int8 -void xa_opt_quantized_conv1d_nchw_asym8sxsym8s_asym8s( +void xa_opt_quantized_conv1d_ncl_asym8sxsym8s_asym8s( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -144,7 +144,7 @@ void xa_opt_quantized_conv1d_nchw_asym8sxsym8s_asym8s( } } -void quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& 
weight, @@ -161,7 +161,7 @@ void quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - xa_opt_quantized_conv1d_nchw_asym8sxsym8s_asym8s( + xa_opt_quantized_conv1d_ncl_asym8sxsym8s_asym8s( ctx, input, weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out.cpp similarity index 96% rename from backends/cadence/hifi/operators/op_quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out.cpp index de5f76b0fff..60e700f563b 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out.cpp @@ -22,7 +22,7 @@ namespace HiFi { namespace native { // Optimized NCHW 1D convolution for uint8 x uint8 -> uint8 -void xa_opt_quantized_conv1d_nchw_asym8uxsym8u_asym8u( +void xa_opt_quantized_conv1d_ncl_asym8uxsym8u_asym8u( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -144,7 +144,7 @@ void xa_opt_quantized_conv1d_nchw_asym8uxsym8u_asym8u( } } -void quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -161,7 +161,7 @@ void quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - xa_opt_quantized_conv1d_nchw_asym8uxsym8u_asym8u( + xa_opt_quantized_conv1d_ncl_asym8uxsym8u_asym8u( ctx, input, weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out.cpp 
b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out.cpp similarity index 95% rename from backends/cadence/hifi/operators/op_quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out.cpp index b549ad13307..c9a3d2b58de 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out.cpp @@ -22,7 +22,7 @@ namespace HiFi { namespace native { // Optimized NHWC 1D convolution for int8 x int8 -> int8 -void xa_opt_quantized_conv1d_nhwc_asym8sxsym8s_asym8s( +void xa_opt_quantized_conv1d_nlc_asym8sxsym8s_asym8s( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -93,7 +93,7 @@ void xa_opt_quantized_conv1d_nhwc_asym8sxsym8s_asym8s( } } -void quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -110,7 +110,7 @@ void quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - xa_opt_quantized_conv1d_nhwc_asym8sxsym8s_asym8s( + xa_opt_quantized_conv1d_nlc_asym8sxsym8s_asym8s( ctx, input, weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out.cpp similarity index 95% rename from backends/cadence/hifi/operators/op_quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out.cpp index f5dbb083522..2d7a4cba509 100644 --- 
a/backends/cadence/hifi/operators/op_quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out.cpp @@ -22,7 +22,7 @@ namespace HiFi { namespace native { // Optimized NHWC 1D convolution for uint8 x uint8 -> uint8 -void xa_opt_quantized_conv1d_nhwc_asym8uxsym8u_asym8u( +void xa_opt_quantized_conv1d_nlc_asym8uxsym8u_asym8u( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -93,7 +93,7 @@ void xa_opt_quantized_conv1d_nhwc_asym8uxsym8u_asym8u( } } -void quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -110,7 +110,7 @@ void quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - xa_opt_quantized_conv1d_nhwc_asym8uxsym8u_asym8u( + xa_opt_quantized_conv1d_nlc_asym8uxsym8u_asym8u( ctx, input, weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp similarity index 97% rename from backends/cadence/hifi/operators/op_quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp index e4074829cf0..e2584485686 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor_out.cpp @@ -22,7 +22,7 @@ namespace HiFi { namespace native { // Optimized NCHW convolution for int8 x int8 -> int8 -void xa_opt_quantized_conv_nchw_asym8sxsym8s_asym8s( +void xa_opt_quantized_conv2d_nchw_asym8sxsym8s_asym8s( KernelRuntimeContext& ctx, const Tensor& input, 
const Tensor& weight, @@ -207,7 +207,7 @@ void xa_opt_quantized_conv_nchw_asym8sxsym8s_asym8s( } } -void quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -224,7 +224,7 @@ void quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - xa_opt_quantized_conv_nchw_asym8sxsym8s_asym8s( + xa_opt_quantized_conv2d_nchw_asym8sxsym8s_asym8s( ctx, input, weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp similarity index 97% rename from backends/cadence/hifi/operators/op_quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv2d_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp index 201b5d7da16..8444fef6bd1 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_asym8uxsym8u_asym8u_per_tensor_out.cpp @@ -22,7 +22,7 @@ namespace HiFi { namespace native { // Optimized NCHW convolution for uint8 x uint8 -> uint8 -void xa_opt_quantized_conv_nchw_asym8uxsym8u_asym8u( +void xa_opt_quantized_conv2d_nchw_asym8uxsym8u_asym8u( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -207,7 +207,7 @@ void xa_opt_quantized_conv_nchw_asym8uxsym8u_asym8u( } } -void quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv2d_nchw_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -224,7 +224,7 @@ void quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, 
Tensor& out) { - xa_opt_quantized_conv_nchw_asym8uxsym8u_asym8u( + xa_opt_quantized_conv2d_nchw_asym8uxsym8u_asym8u( ctx, input, weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp similarity index 96% rename from backends/cadence/hifi/operators/op_quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp index a0e47104e18..787984e52db 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp @@ -22,7 +22,7 @@ namespace HiFi { namespace native { // Specialized depthwise NCHW convolution for int8 x int8 -> int8 -void xa_opt_quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s( +void xa_opt_quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -162,7 +162,7 @@ void xa_opt_quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s( kNnlibMaxDim); } -void quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -179,7 +179,7 @@ void quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - xa_opt_quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s( + xa_opt_quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s( ctx, input, weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp 
b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp similarity index 96% rename from backends/cadence/hifi/operators/op_quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp index 03274413f65..219eaf44ad7 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp @@ -22,7 +22,7 @@ namespace HiFi { namespace native { // Specialized depthwise NCHW convolution for uint8 x uint8 -> uint8 -void xa_opt_quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u( +void xa_opt_quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -162,7 +162,7 @@ void xa_opt_quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u( kNnlibMaxDim); } -void quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -179,7 +179,7 @@ void quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - xa_opt_quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u( + xa_opt_quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u( ctx, input, weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out.cpp similarity index 98% rename from backends/cadence/hifi/operators/op_quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out.cpp rename to 
backends/cadence/hifi/operators/op_quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out.cpp index 34c861faed5..fc279f2bbdf 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out.cpp @@ -122,7 +122,7 @@ __attribute__((noinline)) void conv2d_nchw_dilated_asym8sxsym8s_asym8s_core( } } -void quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out.cpp similarity index 98% rename from backends/cadence/hifi/operators/op_quantized_conv_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out.cpp index 6393554e18f..08ca4657c75 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out.cpp @@ -123,7 +123,7 @@ __attribute__((noinline)) void conv2d_nchw_dilated_asym8uxsym8u_asym8u_core( } } -void quantized_conv_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nchw_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_out.cpp similarity index 98% rename from backends/cadence/hifi/operators/op_quantized_conv_nchw_out.cpp rename to 
backends/cadence/hifi/operators/op_quantized_conv2d_nchw_out.cpp index 604f881ab96..984747d9316 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv_nchw_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nchw_out.cpp @@ -156,7 +156,7 @@ __attribute__((noinline)) void conv2d_nchw_core_generic( } } -void xa_opt_quantized_conv_nchw( +void xa_opt_quantized_conv2d_nchw( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -444,7 +444,7 @@ void xa_opt_quantized_conv_nchw( // bias_scale, since it is a product of the two. The kernel will branch to // quantized::conv1d or quantized::conv2d based on the dimensionality of // activation tensor. -void quantized_conv_nchw( +void quantized_conv2d_nchw( const Tensor& input, const Tensor& weight, const Tensor& bias, @@ -515,7 +515,7 @@ void quantized_conv_nchw( #undef typed_quantized_conv2d_nchw } -void quantized_conv_nchw_out( +void quantized_conv2d_nchw_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -546,7 +546,7 @@ void quantized_conv_nchw_out( optimized = 0; if (optimized) { - xa_opt_quantized_conv_nchw( + xa_opt_quantized_conv2d_nchw( ctx, input, weight, @@ -562,7 +562,7 @@ void quantized_conv_nchw_out( output_zero_point, out); } else { - quantized_conv_nchw( + quantized_conv2d_nchw( input, weight, bias, @@ -579,7 +579,7 @@ void quantized_conv_nchw_out( } } -void quantized_conv_nchw_per_tensor_out( +void quantized_conv2d_nchw_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -606,7 +606,7 @@ void quantized_conv_nchw_per_tensor_out( optimized = 0; if (optimized) { - xa_opt_quantized_conv_nchw( + xa_opt_quantized_conv2d_nchw( ctx, input, weight, @@ -622,7 +622,7 @@ void quantized_conv_nchw_per_tensor_out( output_zero_point, out); } else { - quantized_conv_nchw( + quantized_conv2d_nchw( input, weight, bias, diff --git 
a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_asym8sxsym8s_asym8s_per_tensor_out.cpp similarity index 96% rename from backends/cadence/hifi/operators/op_quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_asym8sxsym8s_asym8s_per_tensor_out.cpp index 3f62c82bfcd..9bd7e641144 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_asym8sxsym8s_asym8s_per_tensor_out.cpp @@ -22,7 +22,7 @@ namespace HiFi { namespace native { // Optimized NHWC convolution for int8 x int8 -> int8 -void xa_opt_quantized_conv_nhwc_asym8sxsym8s_asym8s( +void xa_opt_quantized_conv2d_nhwc_asym8sxsym8s_asym8s( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -150,7 +150,7 @@ void xa_opt_quantized_conv_nhwc_asym8sxsym8s_asym8s( } } -void quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv2d_nhwc_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -167,7 +167,7 @@ void quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - xa_opt_quantized_conv_nhwc_asym8sxsym8s_asym8s( + xa_opt_quantized_conv2d_nhwc_asym8sxsym8s_asym8s( ctx, input, weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_asym8uxsym8u_asym8u_per_tensor_out.cpp similarity index 96% rename from backends/cadence/hifi/operators/op_quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_asym8uxsym8u_asym8u_per_tensor_out.cpp index 32267591cf3..433cbf76fce 
100644 --- a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_asym8uxsym8u_asym8u_per_tensor_out.cpp @@ -22,7 +22,7 @@ namespace HiFi { namespace native { // Optimized NHWC convolution for uint8 x uint8 -> uint8 -void xa_opt_quantized_conv_nhwc_asym8uxsym8u_asym8u( +void xa_opt_quantized_conv2d_nhwc_asym8uxsym8u_asym8u( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -150,7 +150,7 @@ void xa_opt_quantized_conv_nhwc_asym8uxsym8u_asym8u( } } -void quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv2d_nhwc_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -167,7 +167,7 @@ void quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - xa_opt_quantized_conv_nhwc_asym8uxsym8u_asym8u( + xa_opt_quantized_conv2d_nhwc_asym8uxsym8u_asym8u( ctx, input, weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp similarity index 95% rename from backends/cadence/hifi/operators/op_quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp index c232f7e5ef2..384ebbb4f48 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out.cpp @@ -22,7 +22,7 @@ namespace HiFi { namespace native { // Specialized depthwise NHWC convolution for int8 x int8 -> int8 -void xa_opt_quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s( +void 
xa_opt_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -132,7 +132,7 @@ void xa_opt_quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s( } } -void quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -149,7 +149,7 @@ void quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - xa_opt_quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s( + xa_opt_quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s( ctx, input, weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp similarity index 95% rename from backends/cadence/hifi/operators/op_quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp index 5ef102c31d1..07df1a416d7 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out.cpp @@ -22,7 +22,7 @@ namespace HiFi { namespace native { // Specialized depthwise NHWC convolution for uint8 x uint8 -> uint8 -void xa_opt_quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u( +void xa_opt_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -132,7 +132,7 @@ void xa_opt_quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u( } } -void quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out( +void 
quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -149,7 +149,7 @@ void quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, Tensor& out) { - xa_opt_quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u( + xa_opt_quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u( ctx, input, weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out.cpp similarity index 98% rename from backends/cadence/hifi/operators/op_quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out.cpp index 35a1cbda0f9..91965594a5d 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out.cpp @@ -122,7 +122,7 @@ __attribute__((noinline)) void conv2d_nhwc_dilated_asym8sxsym8s_asym8s_core( } } -void quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out( +void quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out.cpp similarity index 98% rename from backends/cadence/hifi/operators/op_quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out.cpp index 
62b5008ab7e..14dc31a719f 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out.cpp @@ -122,7 +122,7 @@ __attribute__((noinline)) void conv2d_nhwc_dilated_asym8uxsym8u_asym8u_core( } } -void quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out( +void quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, diff --git a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp similarity index 98% rename from backends/cadence/hifi/operators/op_quantized_conv_nhwc_out.cpp rename to backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp index 5aa087c4b75..a5d503853c4 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv_nhwc_out.cpp +++ b/backends/cadence/hifi/operators/op_quantized_conv2d_nhwc_out.cpp @@ -147,7 +147,7 @@ __attribute__((noinline)) void conv2d_nhwc_core_generic( } } -void xa_opt_quantized_conv_nhwc( +void xa_opt_quantized_conv2d_nhwc( KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -350,7 +350,7 @@ void xa_opt_quantized_conv_nhwc( } } -void quantized_conv_nhwc( +void quantized_conv2d_nhwc( const Tensor& input, const Tensor& weight, const Tensor& bias, @@ -421,7 +421,7 @@ void quantized_conv_nhwc( #undef typed_quantized_conv2d_nhwc } -void quantized_conv_nhwc_out( +void quantized_conv2d_nhwc_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -452,7 +452,7 @@ void quantized_conv_nhwc_out( optimized = 0; if (optimized) { - xa_opt_quantized_conv_nhwc( + xa_opt_quantized_conv2d_nhwc( ctx, input, weight, @@ -468,7 +468,7 @@ void quantized_conv_nhwc_out( output_zero_point, out); } else { - quantized_conv_nhwc( + quantized_conv2d_nhwc( input, 
weight, bias, @@ -485,7 +485,7 @@ void quantized_conv_nhwc_out( } } -void quantized_conv_nhwc_per_tensor_out( +void quantized_conv2d_nhwc_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, const Tensor& weight, @@ -512,7 +512,7 @@ void quantized_conv_nhwc_per_tensor_out( optimized = 0; if (optimized) { - xa_opt_quantized_conv_nhwc( + xa_opt_quantized_conv2d_nhwc( ctx, input, weight, @@ -528,7 +528,7 @@ void quantized_conv_nhwc_per_tensor_out( output_zero_point, out); } else { - quantized_conv_nhwc( + quantized_conv2d_nhwc( input, weight, bias, diff --git a/backends/cadence/hifi/operators/operators.h b/backends/cadence/hifi/operators/operators.h index 11b93f4a89c..f7f5194d91a 100644 --- a/backends/cadence/hifi/operators/operators.h +++ b/backends/cadence/hifi/operators/operators.h @@ -83,7 +83,7 @@ void quantized_linear_per_tensor_out( const ::executorch::aten::optional<::executorch::aten::Tensor>& offset, ::executorch::aten::Tensor& out); -void quantized_conv_nhwc_out( +void quantized_conv2d_nhwc_out( ::executorch::runtime::KernelRuntimeContext& ctx, const ::executorch::aten::Tensor& input, const ::executorch::aten::Tensor& weight, @@ -101,7 +101,7 @@ void quantized_conv_nhwc_out( const ::executorch::aten::Tensor& out_shift, ::executorch::aten::Tensor& out); -void quantized_conv_nchw_out( +void quantized_conv2d_nchw_out( ::executorch::runtime::KernelRuntimeContext& ctx, const ::executorch::aten::Tensor& input, const ::executorch::aten::Tensor& weight, @@ -119,7 +119,7 @@ void quantized_conv_nchw_out( const ::executorch::aten::Tensor& out_shift, ::executorch::aten::Tensor& out); -void quantized_conv_nchw_per_tensor_out( +void quantized_conv2d_nchw_per_tensor_out( ::executorch::runtime::KernelRuntimeContext& ctx, const ::executorch::aten::Tensor& input, const ::executorch::aten::Tensor& weight, @@ -137,7 +137,7 @@ void quantized_conv_nchw_per_tensor_out( int64_t out_shift, ::executorch::aten::Tensor& out); -void 
quantized_conv_nhwc_per_tensor_out( +void quantized_conv2d_nhwc_per_tensor_out( ::executorch::runtime::KernelRuntimeContext& ctx, const ::executorch::aten::Tensor& input, const ::executorch::aten::Tensor& weight, diff --git a/backends/cadence/hifi/operators/targets.bzl b/backends/cadence/hifi/operators/targets.bzl index fa263d4017c..1f9814c4a4e 100644 --- a/backends/cadence/hifi/operators/targets.bzl +++ b/backends/cadence/hifi/operators/targets.bzl @@ -44,6 +44,7 @@ OPERATORS = [ "cat", "clamp", "dequantize_per_tensor", + "dequantize_per_tensor_asym8s", "div", "embedding", "eq", @@ -63,24 +64,24 @@ OPERATORS = [ "ne", "permute_copy", "pow", - "quantized_conv_nchw_out", - "quantized_conv_nchw_asym8sxsym8s_asym8s_per_tensor_out", - "quantized_conv_nchw_asym8uxsym8u_asym8u_per_tensor_out", - "quantized_conv1d_nchw_asym8sxsym8s_asym8s_per_tensor_out", - "quantized_conv1d_nchw_asym8uxsym8u_asym8u_per_tensor_out", - "quantized_conv_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out", - "quantized_conv_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out", - "quantized_conv_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out", - "quantized_conv_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out", - "quantized_conv_nhwc_out", - "quantized_conv_nhwc_asym8sxsym8s_asym8s_per_tensor_out", - "quantized_conv_nhwc_asym8uxsym8u_asym8u_per_tensor_out", - "quantized_conv1d_nhwc_asym8sxsym8s_asym8s_per_tensor_out", - "quantized_conv1d_nhwc_asym8uxsym8u_asym8u_per_tensor_out", - "quantized_conv_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out", - "quantized_conv_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out", - "quantized_conv_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out", - "quantized_conv_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out", + "quantized_conv2d_nchw_out", + "quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor_out", + "quantized_conv2d_nchw_asym8uxsym8u_asym8u_per_tensor_out", + "quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out", + 
"quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out", + "quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out", + "quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out", + "quantized_conv2d_nchw_dilated_asym8sxsym8s_asym8s_per_tensor_out", + "quantized_conv2d_nchw_dilated_asym8uxsym8u_asym8u_per_tensor_out", + "quantized_conv2d_nhwc_out", + "quantized_conv2d_nhwc_asym8sxsym8s_asym8s_per_tensor_out", + "quantized_conv2d_nhwc_asym8uxsym8u_asym8u_per_tensor_out", + "quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out", + "quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out", + "quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out", + "quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out", + "quantized_conv2d_nhwc_dilated_asym8sxsym8s_asym8s_per_tensor_out", + "quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out", "quantized_fully_connected_out", "quantized_fully_connected_asym8sxasym8s_asym8s_per_tensor_out", "quantized_fully_connected_asym8uxasym8u_asym8u_per_tensor_out", @@ -95,6 +96,7 @@ OPERATORS = [ "quantized_relu_asym8s_asym8s_per_tensor_out", "quantized_relu_asym8u_asym8u_per_tensor_out", "quantize_per_tensor", + "quantize_per_tensor_asym8s", "remainder", "rsqrt", "select_copy", diff --git a/backends/cadence/runtime/TARGETS b/backends/cadence/runtime/TARGETS index 9c65c469280..65a578f4751 100644 --- a/backends/cadence/runtime/TARGETS +++ b/backends/cadence/runtime/TARGETS @@ -21,6 +21,7 @@ runtime.python_library( "//executorch/devtools/bundled_program/serialize:lib", "//executorch/devtools:lib", "//executorch/exir:lib", + ":etdump", ], ) diff --git a/backends/cadence/runtime/etdump.py b/backends/cadence/runtime/etdump.py new file mode 100644 index 00000000000..4ef5d28285a --- /dev/null +++ b/backends/cadence/runtime/etdump.py @@ -0,0 +1,173 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + +import logging +import os +from typing import cast, Optional, Tuple + +import torch +from executorch.devtools import Inspector +from executorch.devtools.inspector import Event, EventBlock, PerfData +from executorch.devtools.inspector._inspector_utils import TimeScale +from tabulate import tabulate + + +class CadenceETDump: + def __init__(self, output_dir: str) -> None: + self.tensor_dump_dir: str = os.path.join(output_dir, "tensors") + self.etdump_path: str = os.path.join(output_dir, "etdump.etdp") + self.etrecord_path: Optional[str] = os.path.join(output_dir, "etrecord.bin") + self.debug_buffer_path: Optional[str] = os.path.join( + output_dir, "debug_output.bin" + ) + + if not os.path.exists(self.etdump_path): + raise RuntimeError(f"{self.etdump_path} does not exist") + # pyre-ignore[6]: os.path.exists expects str, but got Optional[str] + if not os.path.exists(self.etrecord_path): + logging.warning( + "ETRecord not found, intermediate tensors will not be dumped" + ) + self.etrecord_path = None + # pyre-ignore[6]: os.path.exists expects str, but got Optional[str] + if not os.path.exists(self.debug_buffer_path): + logging.warning( + "Debug buffer not found, intermediate tensors will not be dumped" + ) + self.debug_buffer_path = None + + self.et_inspector: Inspector = Inspector( + etdump_path=self.etdump_path, + debug_buffer_path=self.debug_buffer_path, + etrecord=self.etrecord_path, + source_time_scale=TimeScale.CYCLES, + target_time_scale=TimeScale.CYCLES, + ) + + def get_outputs(self, log_to_stdout: bool = False) -> Tuple[torch.Tensor]: + output = [ + event_block.run_output + for event_block in self.et_inspector.event_blocks + if event_block.name == "Execute" + ] + logging.debug(f"[CadenceETDump] output: {output}") + return output[0] + + def get_execute_event_block(self) -> EventBlock: + exec_blocks = [ + eb for 
eb in self.et_inspector.event_blocks if eb.name == "Execute" + ] + return exec_blocks[0] + + def should_include_event(self, event: Event) -> bool: + # exclude duplicate events + if event.name in ("OPERATOR_CALL", "Method::execute"): + return False + + # exclude custom multi-zion events + if event.name.startswith("DELEGATE_ZION"): + return False + + return True + + def print_summary( + self, + bundled_prog_size: Optional[int] = None, + external_link: Optional[str] = None, + ) -> None: + """ + Print performance summary with optional program size and external link. + + Args: + bundled_prog_size: Size of the bundled program in bytes (optional) + external_link: External analytics/monitoring link (optional, e.g., Scuba link for Meta internal use) + """ + block = self.get_execute_event_block() + op_events = [e for e in block.events if self.should_include_event(e)] + op_time_sum = sum([cast(PerfData, e.perf_data).avg for e in op_events]) + + overall_event = [ev for ev in block.events if ev.name == "Method::execute"] + if not len(overall_event) == 1: + logging.warning( + f"Expected one 'Method::execute' event, found {len(overall_event)}" + ) + + total_cycles = cast(PerfData, overall_event[0].perf_data).avg + op_cycles = op_time_sum + + # Build table data and headers dynamically based on what's provided + table_data = [ + "{:,.0f}".format(total_cycles), + "{:,.0f}".format(op_cycles), + "{:,.0f}".format(total_cycles - op_cycles), + "{:.2%}".format((total_cycles - op_cycles) / total_cycles), + ] + headers = [ + "Total Cycles", + "Cycles in Ops", + "Other Cycles", + "Framework Tax (%)", + ] + + # Add optional fields if provided + if bundled_prog_size is not None: + table_data.append("{:,.0f}".format(bundled_prog_size)) + headers.append("Bundled Program Size (bytes)") + + if external_link is not None: + table_data.append(external_link) + headers.append("External Link") + + logging.info( + "Performance Summary:\n%s", + tabulate( + [table_data], + headers=headers, + 
tablefmt="outline", + ), + ) + + def print_event_block(self) -> None: + logging.info("Profiled events:") + if logging.getLogger().level <= logging.INFO: + self.et_inspector.print_data_tabular() + + def dump_intermediate_tensors(self) -> None: + if self.etrecord_path is None: + logging.info("[CadenceETDump] Intermediate tensors not available") + return + + logging.info( + f"[CadenceETDump] Dumping intermediate tensors to {self.tensor_dump_dir}" + ) + os.makedirs(self.tensor_dump_dir, exist_ok=True) + exec_blocks = [ + eb for eb in self.et_inspector.event_blocks if eb.name == "Execute" + ] + if len(exec_blocks) > 1: + logging.warning( + f'Found {len(exec_blocks)} "Execute" blocks, using the first one and ignoring the rest.' + ) + block = exec_blocks[0] + + # OPERATOR_CALL events are duplicates that contain framework tax data. We don't need them + op_events = [e for e in block.events if e.name != "OPERATOR_CALL"] + torch.set_printoptions(profile="full") + + for event in op_events: + instr_id = event._instruction_id + if not event.debug_data: + logging.debug( + f"Missing intermediate tensor data for {event.name} ({instr_id=})" + ) + continue + + with open(f"{self.tensor_dump_dir}/{instr_id}.txt", "w") as f: + for dd in event.debug_data: + f.write(f"{str(dd)}\n\n") + torch.set_printoptions(profile="default") diff --git a/backends/cadence/runtime/runtime.py b/backends/cadence/runtime/runtime.py index 4d1c876bcdb..a7d35fbd0c9 100644 --- a/backends/cadence/runtime/runtime.py +++ b/backends/cadence/runtime/runtime.py @@ -9,9 +9,8 @@ import logging import numbers -import os import tempfile -from typing import Any, Optional, Sequence, Tuple, Union +from typing import Any, Optional, Sequence, Union import executorch.exir.schema as et_schema @@ -19,8 +18,8 @@ import torch from executorch.backends.cadence.runtime import utils +from executorch.backends.cadence.runtime.etdump import CadenceETDump from executorch.backends.cadence.runtime.executor import Executor -from 
executorch.devtools import Inspector from executorch.exir import ExecutorchProgramManager from executorch.exir._serialize._program import deserialize_pte_binary from executorch.exir.schema import DataLocation @@ -30,90 +29,6 @@ from torch.utils._pytree import TreeSpec -class CadenceETDump: - def __init__(self, output_dir: str) -> None: - self.tensor_dump_dir: str = os.path.join(output_dir, "tensors") - self.etdump_path: str = os.path.join(output_dir, "etdump.etdp") - self.etrecord_path: Optional[str] = os.path.join(output_dir, "etrecord.bin") - self.debug_buffer_path: Optional[str] = os.path.join( - output_dir, "debug_output.bin" - ) - - if not os.path.exists(self.etdump_path): - raise RuntimeError(f"{self.etdump_path} does not exist") - # pyre-ignore[6]: os.path.exists expects str, but got Optional[str] - if not os.path.exists(self.etrecord_path): - logging.warning( - "ETRecord not found, intermediate tensors will not be dumped" - ) - self.etrecord_path = None - # pyre-ignore[6]: os.path.exists expects str, but got Optional[str] - if not os.path.exists(self.debug_buffer_path): - logging.warning( - "Debug buffer not found, intermediate tensors will not be dumped" - ) - self.debug_buffer_path = None - - self.et_inspector: Inspector = Inspector( - etdump_path=self.etdump_path, - debug_buffer_path=self.debug_buffer_path, - etrecord=self.etrecord_path, - ) - - def get_outputs(self, log_to_stdout: bool = False) -> Tuple[torch.Tensor]: - output = [ - event_block.run_output - for event_block in self.et_inspector.event_blocks - if event_block.name == "Execute" - ] - logging.debug(f"[ETdump] output: {output}") - return output[0] - - def print_event_block(self) -> None: - logging.debug("[ETdump] data tabular:") - if logging.getLogger().level <= logging.DEBUG: - self.et_inspector.print_data_tabular() - - def print_event_data(self) -> None: - logging.debug("[ETdump] event data ") - for event_block in self.et_inspector.event_blocks: - for event in event_block.events: - 
logging.debug(event) - - def dump_intermediate_tensors(self) -> None: - if self.etrecord_path is None: - logging.info("[ETdump] Intermediate tensors not available") - return - - logging.info(f"[ETdump] Dumping intermediate tensors to {self.tensor_dump_dir}") - os.makedirs(self.tensor_dump_dir, exist_ok=True) - exec_blocks = [ - eb for eb in self.et_inspector.event_blocks if eb.name == "Execute" - ] - if len(exec_blocks) > 1: - logging.warning( - f'Found {len(exec_blocks)} "Execute" blocks, using the first one and ignoring the rest.' - ) - block = exec_blocks[0] - - # OPERATOR_CALL events are duplicates that contain framework tax data. We don't need them - op_events = [e for e in block.events if e.name != "OPERATOR_CALL"] - torch.set_printoptions(profile="full") - - for event in op_events: - instr_id = event._instruction_id - if not event.debug_data: - logging.debug( - f"Missing intermediate tensor data for {event.name} ({instr_id=})" - ) - continue - - with open(f"{self.tensor_dump_dir}/{instr_id}.txt", "w") as f: - for dd in event.debug_data: - f.write(f"{str(dd)}\n\n") - torch.set_printoptions(profile="default") - - def get_op_names(program: et_schema.Program, execution_plan_id: int = 0) -> set[str]: """ Get the list of operators from a Program @@ -162,6 +77,9 @@ def run( etdump = CadenceETDump(output_dir=working_dir) outputs = etdump.get_outputs() + # Print performance summary + etdump.print_summary() + assert isinstance(out_spec, TreeSpec) outputs = torch.utils._pytree.tree_unflatten(outputs, out_spec) diff --git a/backends/cadence/runtime/targets.bzl b/backends/cadence/runtime/targets.bzl index dabe42ad824..09a116764c2 100644 --- a/backends/cadence/runtime/targets.bzl +++ b/backends/cadence/runtime/targets.bzl @@ -13,3 +13,17 @@ def define_common_targets(): "//executorch/runtime/platform:platform", ], ) + + runtime.python_library( + name = "etdump", + srcs = ["etdump.py"], + visibility = [ + "//executorch/backends/cadence/...", + "@EXECUTORCH_CLIENTS" + ], + 
deps = [ + "fbcode//executorch/devtools:lib", + "fbcode//executorch/devtools/inspector:inspector_utils", + "fbsource//third-party/pypi/tabulate:tabulate", + ], + ) diff --git a/backends/cadence/utils/facto_util.py b/backends/cadence/utils/facto_util.py index 5b204e99fcb..e49cf412c19 100644 --- a/backends/cadence/utils/facto_util.py +++ b/backends/cadence/utils/facto_util.py @@ -22,9 +22,95 @@ MAX_CASES = 50 +# Global cache to store generated shapes per tensor to ensure consistency +_shape_cache: dict[str, list[int]] = {} + + def apply_tensor_contraints(op_name: str, index: int) -> list[object]: - # Constraint to limit tensor size product to < 4000 - max_size_constraint = cp.Size.Le(lambda deps, r, d: max(1, int((3999) ** (1 / r)))) + # Constraint to limit tensor size to < 4000 bytes with fully randomized shapes + import random + + def get_dtype_bytes(dtype: torch.dtype) -> int: + """Get the number of bytes per element for a given dtype""" + dtype_bytes = { + torch.int8: 1, + torch.uint8: 1, + torch.int16: 2, + torch.uint16: 2, + torch.int32: 4, + torch.float32: 4, + torch.int64: 8, + torch.float64: 8, + torch.bool: 1, + torch.float: 4, # alias for float32 + torch.int: 4, # alias for int32 + torch.long: 8, # alias for int64 + } + return dtype_bytes.get(dtype, 4) # Default to 4 bytes if dtype not found + + def generate_random_shape_with_byte_limit( + rank: int, dtype: torch.dtype, max_bytes: int = 3999, seed_base: int = 42 + ) -> list[int]: + """Generate a random shape with given rank ensuring total byte size < max_bytes""" + random.seed(seed_base + rank) + + bytes_per_element = get_dtype_bytes(dtype) + max_elements = max_bytes // bytes_per_element + + # Start with all dimensions as 1 + shape = [1] * rank + remaining_elements = ( + max_elements - 1 + ) # Leave room since we start with product=1 + + # Randomly distribute the remaining capacity across dimensions + for i in range(rank): + if remaining_elements <= 1: + break + + # Calculate maximum size this dimension 
can have without exceeding limit + current_product = 1 + for j in range(rank): + if j != i: + current_product *= shape[j] + + max_size_for_dim = min( + remaining_elements // current_product, 50 + ) # Cap at 50 + if max_size_for_dim > shape[i]: + # Randomly choose a size between current and max + new_size = random.randint(shape[i], max_size_for_dim) + shape[i] = new_size + remaining_elements = max_elements // (current_product * new_size) + remaining_elements = max(1, remaining_elements) + + # Final random shuffle of the dimensions to make it more random + random.shuffle(shape) + return shape + + def random_size_constraint(deps: object, r: int, d: int) -> int: + """Generate random sizes ensuring total byte size < 4000 bytes""" + # Use conservative approach: assume worst case is 4 bytes per element (float32/int32) + # This ensures we never exceed 4000 bytes regardless of actual dtype + worst_case_dtype = torch.float32 # 4 bytes per element + + # Create a unique key for this tensor configuration + cache_key = f"{r}_{d}_conservative" + + if cache_key not in _shape_cache: + # Generate a new random shape for this rank using worst-case byte estimation + shape = generate_random_shape_with_byte_limit( + r, worst_case_dtype, max_bytes=3999, seed_base=42 + r * 10 + d + ) + _shape_cache[cache_key] = shape + + # Return the size for dimension d, ensuring we don't go out of bounds + cached_shape = _shape_cache[cache_key] + return cached_shape[d] if d < len(cached_shape) else 1 + + max_size_constraint = cp.Size.Le( + lambda deps, r, d: random_size_constraint(deps, r, d) + ) tensor_constraints = ( [ @@ -81,7 +167,7 @@ def apply_tensor_contraints(op_name: str, index: int) -> list[object]: cp.Size.Ge(lambda deps, r, d: 1), max_size_constraint, ] - else: + elif index == 1: # input tensor(a) tensor_constraints = [ cp.Dtype.In( lambda deps: [ @@ -99,6 +185,25 @@ def apply_tensor_contraints(op_name: str, index: int) -> list[object]: cp.Size.Ge(lambda deps, r, d: 1), max_size_constraint, ] 
+ else: # input tensor(b) + tensor_constraints = [ + cp.Dtype.In( + lambda deps: [ + torch.int8, + torch.int16, + torch.uint8, + torch.uint16, + torch.int32, + torch.float32, + ] + ), + cp.Dtype.Eq(lambda deps: deps[1].dtype), + cp.Value.Ge(lambda deps, dtype, struct: -(2**4)), + cp.Value.Le(lambda deps, dtype, struct: 2**4), + cp.Rank.Ge(lambda deps: 1), + cp.Size.Ge(lambda deps, r, d: 1), + max_size_constraint, + ] case "embedding.default": tensor_constraints = [ cp.Dtype.In(lambda deps: [torch.float, torch.int]), @@ -117,6 +222,34 @@ def apply_tensor_contraints(op_name: str, index: int) -> list[object]: cp.Value.Le(lambda deps, dtype, struct: 2), ] ) + case "transpose_copy.int": + tensor_constraints.extend( + [ + cp.Dtype.In(lambda deps: [torch.float32, torch.int32]), + ] + ) + case "permute_copy.default": + tensor_constraints.extend( + [ + cp.Dtype.In(lambda deps: [torch.float32, torch.int8, torch.uint8]), + cp.Rank.Le( + lambda deps: 5 + ), # xa_nn_transpose only supports up to 5D + cp.Rank.Ge(lambda deps: 1), # Must have at least 1 dimension + ] + ) + case "sqrt.default": + tensor_constraints.extend( + [ + cp.Dtype.In(lambda deps: [torch.float32, torch.int32]), + ] + ) + case "clamp.default": + tensor_constraints.extend( + [ + cp.Dtype.In(lambda deps: [torch.float32, torch.int32]), + ] + ) case "rsqrt.default": tensor_constraints.extend( [ @@ -127,6 +260,12 @@ def apply_tensor_contraints(op_name: str, index: int) -> list[object]: cp.Value.Le(lambda deps, dtype, struct: 2**2), ] ) + case "relu.default": + tensor_constraints.extend( + [ + cp.Dtype.In(lambda deps: [torch.float32]), + ] + ) case "mean.dim": tensor_constraints.extend( [ @@ -136,10 +275,17 @@ def apply_tensor_contraints(op_name: str, index: int) -> list[object]: case "exp.default": tensor_constraints.extend( [ + cp.Dtype.In(lambda deps: [torch.float32]), cp.Value.Ge(lambda deps, dtype, struct: -(2**2)), cp.Value.Le(lambda deps, dtype, struct: 2**2), ] ) + case "tanh.default": + 
tensor_constraints.extend( + [ + cp.Dtype.In(lambda deps: [torch.float32]), + ] + ) case "slice_copy.Tensor": tensor_constraints.extend( [ @@ -148,6 +294,34 @@ def apply_tensor_contraints(op_name: str, index: int) -> list[object]: cp.Value.Le(lambda deps, dtype, struct: 2), ] ) + case "div.Scalar" | "add.Tensor" | "mul.Tensor" | "sub.Tensor": + tensor_constraints.extend( + [ + cp.Dtype.In( + lambda deps: [ + torch.int32, + torch.int64, + torch.float32, + ] + ), + ] + ) + case "split_copy.Tensor": + tensor_constraints.extend( + [ + cp.Dtype.In( + lambda deps: [ + torch.int32, + torch.int64, + torch.float32, + ] + ), + cp.Value.Ge(lambda deps, dtype, struct: 1), + cp.Value.Le(lambda deps, dtype, struct: 2**3), + cp.Rank.Le(lambda deps: 3), + cp.Size.Le(lambda deps, r, d: 2**2), + ] + ) case "constant_pad_nd.default": tensor_constraints.extend( [ @@ -178,6 +352,12 @@ def apply_tensor_contraints(op_name: str, index: int) -> list[object]: cp.Rank.Le(lambda deps: 2**2), ] ) + case "pow.Tensor_Scalar": + tensor_constraints.extend( + [ + cp.Dtype.In(lambda deps: [torch.float32, torch.int32]), + ] + ) case "div.Tensor_mode" | "minimum.default": if index == 0: tensor_constraints = [ diff --git a/backends/cadence/vision/kernels/CMakeLists.txt b/backends/cadence/vision/kernels/CMakeLists.txt new file mode 100644 index 00000000000..fa7b2b5203b --- /dev/null +++ b/backends/cadence/vision/kernels/CMakeLists.txt @@ -0,0 +1,30 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +# lint_cmake: -linelength +add_library( + cadence_kernels + kernels.cpp + ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/library/api/tensor_transposef.c + ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/library/api/vsoftmaxf.c + ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/library/tables/expf_tbl.c + ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/library/tables/nanf_tbl.c + ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/library/tables/inff_tbl.c +) + +# Let files say "include ". +set(_common_include_directories + ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10 +) + +target_include_directories( + cadence_kernels + PUBLIC . ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/include + ${EXECUTORCH_ROOT}/backends/cadence/vision/third-party/include_private + ${_common_include_directories} +) + +target_link_libraries(cadence_kernels PRIVATE idma) diff --git a/backends/cadence/vision/kernels/kernels.cpp b/backends/cadence/vision/kernels/kernels.cpp new file mode 100644 index 00000000000..70c811df741 --- /dev/null +++ b/backends/cadence/vision/kernels/kernels.cpp @@ -0,0 +1,198 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include + +namespace impl { +namespace vision { +namespace kernels { + +void* allocate_temp_memory(KernelRuntimeContext& ctx, size_t size) { + Result temp_mem_res = ctx.allocate_temp(size); + return temp_mem_res.ok() ? 
temp_mem_res.get() : nullptr; +} + +// Quantize a fp32 value to an int8_t/uint8_t value +template +T quantize(const float x, float scale, int32_t zero_point) { + constexpr float min_val = std::numeric_limits::min(); + constexpr float max_val = std::numeric_limits::max(); + float tmp = roundf(x * scale + zero_point); + return std::max(std::min(tmp, max_val), min_val); +} + +// Quantize an fp32 array to an int8_t/uint8_t array +template +void quantize( + T* __restrict__ y, + const float* __restrict__ x, + float inv_scale, + int32_t zero_point, + size_t size) { + for (size_t i = 0; i < size; ++i) { + y[i] = quantize(x[i], inv_scale, zero_point); + } +} + +// Dequantize an int8_t/uint8_t value to an fp32 value +template +float dequantize(const T x, float scale, int32_t zero_point) { + return scale * (x - zero_point); +} + +// Dequantize an int8_t/uint8_t/int16_t array to an fp32 array +template +void dequantize( + float* __restrict__ y, + const T* __restrict__ x, + float scale, + int32_t zero_point, + size_t size) { + for (size_t i = 0; i < size; ++i) { + y[i] = dequantize(x[i], scale, zero_point); + } +} + +// Requantize the int8_t/uint8_t in value to a uint8_t/int8_t out value. +// The scale and zero_point for requantization are in the args. +template +OT requantize( + const IT in, + float in_scale, + int32_t in_zero_point, + float inv_out_scale, + int32_t out_zero_point) { + float dequant = dequantize(in, in_scale, in_zero_point); + return quantize(dequant, inv_out_scale, out_zero_point); +} + +// Requantize the int8_t/uint8_t in array to a uint8_t/int8_t out array. +// The scale and zero_point for requantization are in the args. 
+template +void requantize( + OT* __restrict__ out, + const IT* __restrict__ in, + float in_scale, + int32_t in_zero_point, + float inv_out_scale, + int32_t out_zero_point, + size_t size) { + for (size_t i = 0; i < size; ++i) { + out[i] = requantize( + in[i], in_scale, in_zero_point, inv_out_scale, out_zero_point); + } +} + +// explicit template instantiation + +#define typed_quantize_val(dtype) \ + template dtype quantize(const float x, float inv_scale, int32_t zero_point); +typed_quantize_val(int8_t); +typed_quantize_val(uint8_t); +typed_quantize_val(int16_t); +typed_quantize_val(uint16_t); +typed_quantize_val(int32_t); +#undef typed_quantize_val + +#define typed_quantize_vec(dtype) \ + template void quantize( \ + dtype* __restrict__ y, \ + const float* __restrict__ x, \ + float inv_scale, \ + int32_t zero_point, \ + size_t size); +typed_quantize_vec(int8_t); +typed_quantize_vec(uint8_t); +typed_quantize_vec(int16_t); +typed_quantize_vec(uint16_t); +typed_quantize_vec(int32_t); +#undef typed_quantize_vec + +#define typed_dequantize_val(dtype) \ + template float dequantize(const dtype x, float scale, int32_t zero_point); +typed_dequantize_val(int8_t); +typed_dequantize_val(uint8_t); +typed_dequantize_val(int16_t); +typed_dequantize_val(uint16_t); +typed_dequantize_val(int32_t); +#undef typed_dequantize_val + +#define typed_dequantize_vec(dtype) \ + template void dequantize( \ + float* __restrict__ y, \ + const dtype* __restrict__ x, \ + float scale, \ + int32_t zero_point, \ + size_t size); +typed_dequantize_vec(int8_t); +typed_dequantize_vec(uint8_t); +typed_dequantize_vec(int16_t); +typed_dequantize_vec(uint16_t); +typed_dequantize_vec(int32_t); +#undef typed_dequantize_vec + +#define typed_requantize_val(itype, otype) \ + template otype requantize( \ + const itype in, \ + float in_scale, \ + int32_t in_zero_point, \ + float inv_out_scale, \ + int32_t out_zero_point); +typed_requantize_val(int8_t, int8_t); +typed_requantize_val(int8_t, uint8_t); 
+typed_requantize_val(int8_t, int16_t); +typed_requantize_val(int8_t, uint16_t); +typed_requantize_val(uint8_t, int8_t); +typed_requantize_val(uint8_t, uint8_t); +typed_requantize_val(uint8_t, int16_t); +typed_requantize_val(uint8_t, uint16_t); +typed_requantize_val(int16_t, int8_t); +typed_requantize_val(int16_t, uint8_t); +typed_requantize_val(int16_t, int16_t); +typed_requantize_val(int16_t, uint16_t); +typed_requantize_val(uint16_t, int8_t); +typed_requantize_val(uint16_t, uint8_t); +typed_requantize_val(uint16_t, int16_t); +typed_requantize_val(uint16_t, uint16_t); +#undef typed_requantize_val + +#define typed_requantize_vec(itype, otype) \ + template void requantize( \ + otype* __restrict__ out, \ + const itype* __restrict__ in, \ + float in_scale, \ + int32_t in_zero_point, \ + float inv_out_scale, \ + int32_t out_zero_point, \ + size_t size); +typed_requantize_vec(int8_t, int8_t); +typed_requantize_vec(int8_t, uint8_t); +typed_requantize_vec(int8_t, int16_t); +typed_requantize_vec(int8_t, uint16_t); +typed_requantize_vec(uint8_t, int8_t); +typed_requantize_vec(uint8_t, uint8_t); +typed_requantize_vec(uint8_t, int16_t); +typed_requantize_vec(uint8_t, uint16_t); +typed_requantize_vec(int16_t, int8_t); +typed_requantize_vec(int16_t, uint8_t); +typed_requantize_vec(int16_t, int16_t); +typed_requantize_vec(int16_t, uint16_t); +typed_requantize_vec(uint16_t, int8_t); +typed_requantize_vec(uint16_t, uint8_t); +typed_requantize_vec(uint16_t, int16_t); +typed_requantize_vec(uint16_t, uint16_t); +#undef typed_requantize_vec + +}; // namespace kernels +}; // namespace vision +}; // namespace impl diff --git a/backends/cadence/vision/kernels/kernels.h b/backends/cadence/vision/kernels/kernels.h new file mode 100644 index 00000000000..e86a36515ec --- /dev/null +++ b/backends/cadence/vision/kernels/kernels.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include "inttypes.h" +#include "stddef.h" + +using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::Result; + +namespace impl { +namespace vision { +namespace kernels { + +void* allocate_temp_memory(KernelRuntimeContext& ctx, size_t size); + +template +T quantize(const float x, float scale, int32_t zero_point); + +template +float dequantize(const T x, float scale, int32_t zero_point); + +template +void quantize( + T* __restrict__ y, + const float* __restrict__ x, + float scale, + int32_t zero_point, + size_t size); + +// Dequantize an int8_t/uint8_t/int16_t array to an fp32 array +template +void dequantize( + float* __restrict__ y, + const T* __restrict__ x, + float scale, + int32_t zero_point, + size_t size); + +template +OT requantize( + const IT in, + float in_scale, + int32_t in_zero_point, + float inv_out_scale, + int32_t out_zero_point); + +template +void requantize( + OT* __restrict__ out, + const IT* __restrict__ in, + float in_scale, + int32_t in_zero_point, + float inv_out_scale, + int32_t out_zero_point, + size_t size); + +}; // namespace kernels +}; // namespace vision +}; // namespace impl diff --git a/backends/cadence/vision/kernels/targets.bzl b/backends/cadence/vision/kernels/targets.bzl new file mode 100644 index 00000000000..02136c872b3 --- /dev/null +++ b/backends/cadence/vision/kernels/targets.bzl @@ -0,0 +1,25 @@ +load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def define_common_targets(): + runtime.cxx_library( + name = "cadence_kernels", + srcs = ["kernels.cpp"], + exported_headers = [ + "kernels.h", + ], + visibility = [ + "//executorch/backends/cadence/...", + "@EXECUTORCH_CLIENTS", + ], + platforms = CXX, + compatible_with = select({ + "DEFAULT": [], + 
"ovr_config//cpu:xtensa": ["ovr_config//cpu:xtensa"], + }), + define_static_target = True, + deps = [ + "//executorch/backends/cadence/vision/third-party:vision-nnlib", + "//executorch/runtime/kernel:kernel_includes", + ], + ) diff --git a/backends/cadence/vision/operators/CMakeLists.txt b/backends/cadence/vision/operators/CMakeLists.txt new file mode 100644 index 00000000000..76b784681be --- /dev/null +++ b/backends/cadence/vision/operators/CMakeLists.txt @@ -0,0 +1,120 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +cmake_minimum_required(VERSION 3.19) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() + +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) +include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake) + +if(NOT PYTHON_EXECUTABLE) + resolve_python_executable() +endif() + +# ATen compliant ops that are needed to run this model. 
+set(_aten_ops__srcs + "${CMAKE_CURRENT_SOURCE_DIR}/op_add.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_embedding.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_full.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_view_copy.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/op_softmax.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/activation_ops_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/copy_ops_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/broadcast_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/dtype_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/index_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/kernel_ops_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/matmul_ops_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/slice_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_div.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_hardtanh.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_max_pool2d_with_indices.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_mean.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_mul.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_permute_copy.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_rsqrt.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sigmoid.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_slice_copy.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_split_with_sizes_copy.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sub.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_to_copy.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp" + 
"${EXECUTORCH_ROOT}/kernels/portable/cpu/op_expand_copy.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_gelu.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_empty.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_transpose_copy.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_eq.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_logical_not.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_any.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_native_group_norm.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sum.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_select_copy.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/delinearize_index.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/dtype_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/normalization_ops_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/select_copy_util.cpp" +) +add_library(aten_ops_cadence ${_aten_ops__srcs}) +target_link_libraries(aten_ops_cadence PUBLIC executorch) +target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels) + +# Let files say "include ". +set(_common_include_directories + ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10 +) + +target_include_directories( + aten_ops_cadence PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} + ${_common_include_directories} +) + +# Custom ops that are needed to run the test model. +add_library( + custom_ops + "op_quantized_linear_out.cpp" + "op_quantized_conv_out.cpp" + "op_quantized_relu_out.cpp" + "op_quantized_layer_norm.cpp" + "op_quantize_per_tensor.cpp" + "op_quantized_fully_connected_out.cpp" + "op_dequantize_per_tensor.cpp" + "op_quantized_matmul_out.cpp" + "op_requantize_out.cpp" + "op_im2row_out.cpp" +) +target_include_directories( + custom_ops PUBLIC ${ROOT_DIR}/.. 
${CMAKE_BINARY_DIR} + ${_common_include_directories} +) + +target_link_libraries(custom_ops PUBLIC executorch) +target_link_libraries(custom_ops PRIVATE cadence_kernels) + +# Generate C++ bindings to register kernels into both PyTorch (for AOT) and +# Executorch (for runtime). Here select all ops in functions_vision.yaml +gen_selected_ops( + LIB_NAME "cadence_ops_lib" OPS_SCHEMA_YAML + "${CMAKE_CURRENT_LIST_DIR}/../../aot/functions_vision.yaml" "" "" +) +generate_bindings_for_kernels( + LIB_NAME "cadence_ops_lib" OPS_SCHEMA_YAML FUNCTIONS_YAML + ${CMAKE_CURRENT_SOURCE_DIR}/../../aot/functions_vision.yaml +) +message("Generated cadence x86 files ${gen_command_sources}") + +gen_operators_lib( + LIB_NAME "cadence_ops_lib" KERNEL_LIBS custom_ops DEPS aten_ops_cadence +) diff --git a/backends/cadence/generic/operators/op_add.cpp b/backends/cadence/vision/operators/op_add.cpp similarity index 72% rename from backends/cadence/generic/operators/op_add.cpp rename to backends/cadence/vision/operators/op_add.cpp index 89b67467605..81014143275 100644 --- a/backends/cadence/generic/operators/op_add.cpp +++ b/backends/cadence/vision/operators/op_add.cpp @@ -11,8 +11,18 @@ #include #include -namespace torch { -namespace executor { +using executorch::aten::Scalar; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::runtime::canCast; +using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::promoteTypes; +using torch::executor::apply_binary_elementwise_fn; +using torch::executor::Error; +using torch::executor::native::utils::extract_scalar; + +namespace impl { +namespace vision { namespace native { Tensor& add_out( @@ -23,6 +33,8 @@ Tensor& add_out( Tensor& out) { (void)ctx; + using namespace torch::executor::native::utils; + ScalarType a_type = a.scalar_type(); ScalarType b_type = b.scalar_type(); ScalarType common_type = promoteTypes(a_type, b_type); @@ -39,7 +51,9 @@ Tensor& add_out( using CTYPE_IN = float; using CTYPE_OUT 
= float; CTYPE_IN alpha_val; - ET_EXTRACT_SCALAR(alpha, alpha_val); + ET_CHECK_MSG( + extract_scalar(alpha, &alpha_val), + "Could not be extracted: wrong type or out of range"); apply_binary_elementwise_fn( [alpha_val](const CTYPE_A val_a, const CTYPE_B val_b) { @@ -57,5 +71,5 @@ Tensor& add_out( } } // namespace native -} // namespace executor -} // namespace torch +} // namespace vision +} // namespace impl diff --git a/backends/cadence/vision/operators/op_dequantize_per_tensor.cpp b/backends/cadence/vision/operators/op_dequantize_per_tensor.cpp new file mode 100644 index 00000000000..daffecda1bf --- /dev/null +++ b/backends/cadence/vision/operators/op_dequantize_per_tensor.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +namespace impl { +namespace vision { +namespace native { + +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::runtime::KernelRuntimeContext; + +void dequantize_per_tensor_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + float* out_data = out.mutable_data_ptr(); + size_t numel = out.numel(); + + if (input.scalar_type() == ScalarType::Byte) { + const uint8_t* input_data = input.const_data_ptr(); + kernels::dequantize( + out_data, input_data, scale, zero_point, numel); + } else if (input.scalar_type() == ScalarType::Char) { + const int8_t* input_data = input.const_data_ptr(); + kernels::dequantize(out_data, input_data, scale, zero_point, numel); + } else if ( + input.scalar_type() == ScalarType::Bits16 || + input.scalar_type() == ScalarType::UInt16) { + const uint16_t* input_data = input.const_data_ptr(); + kernels::dequantize( + out_data, input_data, scale, 
zero_point, numel); + } else if (input.scalar_type() == ScalarType::Short) { + const int16_t* input_data = input.const_data_ptr(); + kernels::dequantize( + out_data, input_data, scale, zero_point, numel); + } else if (input.scalar_type() == ScalarType::Int) { + const int32_t* input_data = input.const_data_ptr(); + kernels::dequantize( + out_data, input_data, scale, zero_point, numel); + } else { + ET_CHECK_MSG( + false, + "Unhandled input dtype %hhd", + static_cast(input.scalar_type())); + } +} + +}; // namespace native +}; // namespace vision +}; // namespace impl diff --git a/backends/cadence/generic/operators/op_embedding.cpp b/backends/cadence/vision/operators/op_embedding.cpp similarity index 92% rename from backends/cadence/generic/operators/op_embedding.cpp rename to backends/cadence/vision/operators/op_embedding.cpp index ce28789a156..5273cb083e8 100644 --- a/backends/cadence/generic/operators/op_embedding.cpp +++ b/backends/cadence/vision/operators/op_embedding.cpp @@ -8,13 +8,13 @@ #include -namespace torch { -namespace executor { -namespace native { - using executorch::aten::Tensor; using executorch::runtime::KernelRuntimeContext; +namespace impl { +namespace vision { +namespace native { + void embedding_out( KernelRuntimeContext& ctx, const Tensor& weight, @@ -37,5 +37,5 @@ void embedding_out( } } // namespace native -} // namespace executor -} // namespace torch +} // namespace vision +} // namespace impl diff --git a/backends/cadence/generic/operators/op_full.cpp b/backends/cadence/vision/operators/op_full.cpp similarity index 68% rename from backends/cadence/generic/operators/op_full.cpp rename to backends/cadence/vision/operators/op_full.cpp index 21d5fc56299..afc29718a2b 100644 --- a/backends/cadence/generic/operators/op_full.cpp +++ b/backends/cadence/vision/operators/op_full.cpp @@ -9,12 +9,18 @@ #include #include -namespace torch { -namespace executor { -namespace native { - +using executorch::aten::IntArrayRef; +using executorch::aten::Scalar; 
using executorch::aten::ScalarType; using executorch::aten::Tensor; +using executorch::runtime::KernelRuntimeContext; +using torch::executor::Error; +using torch::executor::native::utils::extract_scalar; +using torch::executor::native::utils::get_scalar_dtype; + +namespace impl { +namespace vision { +namespace native { Tensor& full_out( KernelRuntimeContext& ctx, @@ -23,7 +29,7 @@ Tensor& full_out( Tensor& out) { (void)ctx; - ScalarType val_type = utils::get_scalar_dtype(fill_value); + ScalarType val_type = get_scalar_dtype(fill_value); ScalarType out_type = out.scalar_type(); Error err = resize_tensor(out, sizes); @@ -31,7 +37,9 @@ Tensor& full_out( ET_SWITCH_REAL_TYPES_AND(Bool, val_type, ctx, "full", CTYPE_VAL, [&] { CTYPE_VAL val; - ET_EXTRACT_SCALAR(fill_value, val); + ET_CHECK_MSG( + extract_scalar(fill_value, &val), + "Could not be extracted: wrong type or out of range"); ET_SWITCH_REAL_TYPES_AND(Bool, out_type, ctx, "full", CTYPE_OUT, [&] { CTYPE_OUT val_casted = static_cast(val); @@ -46,5 +54,5 @@ Tensor& full_out( } } // namespace native -} // namespace executor -} // namespace torch +} // namespace vision +} // namespace impl diff --git a/backends/cadence/vision/operators/op_im2row_out.cpp b/backends/cadence/vision/operators/op_im2row_out.cpp new file mode 100644 index 00000000000..501f8ce5376 --- /dev/null +++ b/backends/cadence/vision/operators/op_im2row_out.cpp @@ -0,0 +1,298 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +namespace impl { +namespace vision { +namespace native { + +using ::executorch::aten::IntArrayRef; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::KernelRuntimeContext; + +template +__attribute__((always_inline)) void im2row_( + const T* __restrict__ data_im, + const int32_t in_zero_point, + /* input parameters*/ + const int32_t channels, + const int32_t height, + const int32_t width, + /* output parameters */ + const int32_t out_height, + const int32_t out_width, + /* convolution parameters */ + const int32_t kernel_h, + const int32_t kernel_w, + const int32_t pad_h, + const int32_t pad_w, + const int32_t stride_h, + const int32_t stride_w, + const int32_t dilation_h, + const int32_t dilation_w, + T* __restrict__ data_col, + bool channels_last) { + // Consider convolving the input image of dimensions channels * height * width + // (or height * width * channels for NHWC layout) with a filter of dimensions + // channels * kernels_h * kernels_w. Assume that this convolution will produce + // an output of dimensions out_height x out_width. For each point in the output, + // im2row takes the data from the input that is used in the computation of + // that output point, and flattens it into a vector of size channels_col = + // channels * kernel_h * kernel_w. The output of im2row will therefore be a 2D + // array of size (out_height * out_width) x channels_col + const int32_t channels_col = channels * kernel_h * kernel_w; + + // If the layout is NHWC, we can copy 'channels' worth of contiguous data + // points when performing im2row. + if (channels_last) { + // Iterate over the output domain + for (int _h = 0; _h < out_height; ++_h) { + for (int _w = 0; _w < out_width; ++_w) { + int32_t i_col = _h * out_width + _w; + // Each point in the output domain is the result of applying a filter of + // size kernel_h x kernel_w x channels on the input.
But since channels + // is contiguous, we will not explicitly have a loop for it. + for (int _kh = 0; _kh < kernel_h; ++_kh) { + int32_t h_im = _h * stride_h - pad_h + _kh * dilation_h; + for (int _kw = 0; _kw < kernel_w; ++_kw) { + int32_t w_im = _w * stride_w - pad_w + _kw * dilation_w; + + // h_im and w_im are the actual height and width coordinates of the + // input tensor from where we need to copy 'channels' points. + const T* __restrict__ slice_im = + data_im + (h_im * width + w_im) * channels; + T* __restrict__ slice_col = data_col + i_col * channels_col + + (_kh * kernel_w + _kw) * channels; + // If the coordinates were within the input domain, we copy + // 'channels' contiguous values. Otherwise we will fill the output + // with 0's. + if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + std::memcpy(slice_col, slice_im, channels * sizeof(T)); + } else { + std::fill_n(slice_col, channels, T(in_zero_point)); + } + } + } + } + } + } else { + // Iterate over the output domain + for (int _h = 0; _h < out_height; ++_h) { + for (int _w = 0; _w < out_width; ++_w) { + int32_t i_col = _h * out_width + _w; + + // Each point in the output domain is the result of applying a filter + // of size channels * kernel_h x kernel_w on the input + for (int _c = 0; _c < channels; ++_c) { + for (int _kh = 0; _kh < kernel_h; ++_kh) { + for (int _kw = 0; _kw < kernel_w; ++_kw) { + // c_col is the linearized access in the channels_col vector. + int32_t c_col = (_c * kernel_h + _kh) * kernel_w + _kw; + // h_im and w_im are the actual height and width coordinates of + // the input tensor that we need to copy to the output. + int32_t h_im = _h * stride_h - pad_h + _kh * dilation_h; + int32_t w_im = _w * stride_w - pad_w + _kw * dilation_w; + // If the current data access is within the input tensor, copy the + // value + data_col[i_col * channels_col + c_col] = + (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) + ? 
data_im[(_c * height + h_im) * width + w_im] + : static_cast(in_zero_point); + } + } + } + } + } + } +} + +void im2row_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef dilation, + IntArrayRef padding, + IntArrayRef stride, + const Tensor& in_zero_point, + bool channel_last, + Tensor& out) { + // Compute the input tensor's dims + bool unit_height = input.dim() == 3; + const int32_t batch_size = input.size(0); + const int32_t in_c = + channel_last ? input.size(3 - unit_height) : input.size(1); + const int32_t in_h = + unit_height ? 1 : (channel_last ? input.size(1) : input.size(2)); + const int32_t in_w = + channel_last ? input.size(2 - unit_height) : input.size(3 - unit_height); + + // Get the kernel parameters + int32_t kernel_h = kernel_size[0]; + int32_t kernel_w = kernel_size[1]; + int32_t dilation_h = dilation[0]; + int32_t dilation_w = dilation[1]; + int32_t pad_h = padding[0]; + int32_t pad_w = padding[1]; + int32_t stride_h = stride[0]; + int32_t stride_w = stride[1]; + + // If we were to apply a convolution on the input tensor, compute the output + // height and width. + int32_t out_h = + (in_h + 2 * pad_h - dilation_h * (kernel_h - 1) - 1) / stride_h + 1; + int32_t out_w = + (in_w + 2 * pad_w - dilation_w * (kernel_w - 1) - 1) / stride_w + 1; + + ET_DCHECK_MSG( + (out_h * out_w) == out.size(1), "dimension mismatch for output"); + ET_DCHECK_MSG( + (kernel_h * kernel_w * in_c) == out.size(2), + "dimension mismatch for output"); + + // Check if the input is per-tensor quantized or per-channel quantized. The + // zero point for each batch could differ for per-channel quantized input. 
+ bool per_tensor_quantized = in_zero_point.numel() == 1; + +#define typed_im2row(dtype, ctype) \ + case ScalarType::dtype: { \ + const ctype* __restrict__ in_data = input.const_data_ptr(); \ + ctype* __restrict__ out_data = out.mutable_data_ptr(); \ + const int32_t* __restrict__ zero_point = \ + in_zero_point.const_data_ptr(); \ + int32_t in_plane = in_c * in_h * in_w; \ + int32_t out_plane = kernel_h * kernel_w * in_c * out_h * out_w; \ + for (size_t n = 0; n < batch_size; ++n) { \ + im2row_( \ + &in_data[n * in_plane], \ + per_tensor_quantized ? zero_point[0] : zero_point[n], \ + in_c, \ + in_h, \ + in_w, \ + out_h, \ + out_w, \ + kernel_h, \ + kernel_w, \ + pad_h, \ + pad_w, \ + stride_h, \ + stride_w, \ + dilation_h, \ + dilation_w, \ + &out_data[n * out_plane], \ + channel_last); \ + } \ + break; \ + } + + ScalarType dtype = input.scalar_type(); + switch (dtype) { + typed_im2row(Float, float); + typed_im2row(Byte, uint8_t); + typed_im2row(Char, int8_t); + default: + ET_DCHECK_MSG( + false, + "im2row not implemented for dtype %s", + torch::executor::toString(dtype)); + } +#undef typed_im2row +} + +void im2row_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef dilation, + IntArrayRef padding, + IntArrayRef stride, + int64_t in_zero_point, + bool channel_last, + Tensor& out) { + // Compute the input tensor's dims + bool unit_height = input.dim() == 3; + const int32_t batch_size = input.size(0); + const int32_t in_c = + channel_last ? input.size(3 - unit_height) : input.size(1); + const int32_t in_h = + unit_height ? 1 : (channel_last ? input.size(1) : input.size(2)); + const int32_t in_w = + channel_last ? 
input.size(2 - unit_height) : input.size(3 - unit_height); + + // Get the kernel parameters + int32_t kernel_h = kernel_size[0]; + int32_t kernel_w = kernel_size[1]; + int32_t dilation_h = dilation[0]; + int32_t dilation_w = dilation[1]; + int32_t pad_h = padding[0]; + int32_t pad_w = padding[1]; + int32_t stride_h = stride[0]; + int32_t stride_w = stride[1]; + + // If we were to apply a convolution on the input tensor, compute the output + // height and width. + int32_t out_h = + (in_h + 2 * pad_h - dilation_h * (kernel_h - 1) - 1) / stride_h + 1; + int32_t out_w = + (in_w + 2 * pad_w - dilation_w * (kernel_w - 1) - 1) / stride_w + 1; + + ET_DCHECK_MSG( + (out_h * out_w) == out.size(1), "dimension mismatch for output"); + ET_DCHECK_MSG( + (kernel_h * kernel_w * in_c) == out.size(2), + "dimension mismatch for output"); + +#define typed_im2row_per_tensor(dtype, ctype) \ + case ScalarType::dtype: { \ + const ctype* __restrict__ in_data = input.const_data_ptr(); \ + ctype* __restrict__ out_data = out.mutable_data_ptr(); \ + int32_t in_plane = in_c * in_h * in_w; \ + int32_t out_plane = kernel_h * kernel_w * in_c * out_h * out_w; \ + for (size_t n = 0; n < batch_size; ++n) { \ + im2row_( \ + &in_data[n * in_plane], \ + in_zero_point, \ + in_c, \ + in_h, \ + in_w, \ + out_h, \ + out_w, \ + kernel_h, \ + kernel_w, \ + pad_h, \ + pad_w, \ + stride_h, \ + stride_w, \ + dilation_h, \ + dilation_w, \ + &out_data[n * out_plane], \ + channel_last); \ + } \ + break; \ + } + + ScalarType dtype = input.scalar_type(); + switch (dtype) { + typed_im2row_per_tensor(Float, float); + typed_im2row_per_tensor(Byte, uint8_t); + typed_im2row_per_tensor(Char, int8_t); + default: + ET_DCHECK_MSG( + false, + "im2row.per_tensor not implemented for dtype %s", + torch::executor::toString(dtype)); + } +#undef typed_im2row_per_tensor +} + +} // namespace native +} // namespace vision +} // namespace impl diff --git a/backends/cadence/vision/operators/op_quantize_per_tensor.cpp 
b/backends/cadence/vision/operators/op_quantize_per_tensor.cpp new file mode 100644 index 00000000000..cd72d2de2b5 --- /dev/null +++ b/backends/cadence/vision/operators/op_quantize_per_tensor.cpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +namespace impl { +namespace vision { +namespace native { + +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::runtime::KernelRuntimeContext; + +// Quantize the input tensor (PT2 version). Note that quant_ are not +// used in any computation. +void quantize_per_tensor_out( + KernelRuntimeContext& context, + const Tensor& input, + double scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + ScalarType dtype, + Tensor& out) { + const float* input_data = input.const_data_ptr(); + size_t numel = out.numel(); + + if (out.scalar_type() == ScalarType::Byte) { + uint8_t* out_data = out.mutable_data_ptr(); + kernels::quantize( + out_data, input_data, 1. / scale, zero_point, numel); + } else if (out.scalar_type() == ScalarType::Char) { + int8_t* out_data = out.mutable_data_ptr(); + kernels::quantize( + out_data, input_data, 1. / scale, zero_point, numel); + } else if ( + out.scalar_type() == ScalarType::Bits16 || + out.scalar_type() == ScalarType::UInt16) { + uint16_t* out_data = out.mutable_data_ptr(); + kernels::quantize( + out_data, input_data, 1. / scale, zero_point, numel); + } else if (out.scalar_type() == ScalarType::Short) { + int16_t* out_data = out.mutable_data_ptr(); + kernels::quantize( + out_data, input_data, 1. / scale, zero_point, numel); + } else if (out.scalar_type() == ScalarType::Int) { + int32_t* out_data = out.mutable_data_ptr(); + kernels::quantize( + out_data, input_data, 1. 
/ scale, zero_point, numel); + } else { + ET_CHECK_MSG( + false, + "Unhandled input dtype %hhd", + static_cast(out.scalar_type())); + } +} + +}; // namespace native +}; // namespace vision +}; // namespace impl diff --git a/backends/cadence/vision/operators/op_quantized_conv_out.cpp b/backends/cadence/vision/operators/op_quantized_conv_out.cpp new file mode 100644 index 00000000000..1e1e6c8cdc7 --- /dev/null +++ b/backends/cadence/vision/operators/op_quantized_conv_out.cpp @@ -0,0 +1,608 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +namespace impl { +namespace vision { +namespace native { + +using ::executorch::aten::IntArrayRef; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::KernelRuntimeContext; + +// This implements a generic 2d conv kernel that operates on raw pointers. +// The version handles both quantized and fp32 convolutions. 
+// The input is of shape [n x c x h x w] +// The weight is of shape [oc x wc x wh x ww], where wc == c +// The output is of shape [n x oc x oh x ow] +// The bias is of shape [oc] +template < + typename IT = float, + typename WT = IT, + typename BT = IT, + typename OT = IT, + bool quantized = false> +__attribute__((noinline)) void conv2d_nchw_core_generic( + // All the arrays + const IT* __restrict__ p_in, + const WT* __restrict__ p_weight, + const BT* __restrict__ p_bias, + OT* __restrict__ p_out, + // The array sizes + int32_t n, + int32_t c, + int32_t h, + int32_t w, + int32_t oc, + int32_t wc, + int32_t wh, + int32_t ww, + int32_t oh, + int32_t ow, + // Stride + int16_t s0, + int16_t s1, + // Padding + int16_t p0, + int16_t p1, + // Dilation + int16_t d0, + int16_t d1, + // Group for depthwise conv + int16_t groups, + // Optional args that are only relevant for quantized convolution + // input zero point + IT in_zero_point = 0, + // weight zero point + int32_t weight_zero_point = 0, + float bias_scale = 1, + float out_scale = 1, + OT out_zero_point = 0) { + float inv_out_scale = 1. / out_scale; + bool zero_pad_unit_dilation = d0 == 1 && d1 == 1 && p0 == 0 && p1 == 0; + + // Compute the number of in and out channels per group + const int ocpg = oc / groups; + const int icpg = c / groups; + + // Iterate over all the output batches (i.e., n) + for (int _n = 0; _n < n; ++_n) { + const IT* in_batch = p_in + _n * c * h * w; + OT* out_batch = p_out + _n * oc * oh * ow; + // Compute separable convolution for each group + for (int _g = 0; _g < groups; ++_g) { + // Identify the input and output channels involved in the computation + // of this group + int sic = _g * icpg; + int soc = _g * ocpg; + // Populate all the output channels in the group + for (int _oc = soc; _oc < soc + ocpg; ++_oc) { + OT* out_plane = out_batch + _oc * oh * ow; + const WT* weight_batch = p_weight + _oc * wc * wh * ww; + // We compute one output channel at a time. 
The computation can be + // thought of as a stencil computation: we iterate over an input of size + // icpg x h x w, with a stencil of size icpg x wh x ww, to compute an + // output channel of size 1 x oh x ow. + for (int _h = 0, _oh = 0; _oh < oh; _h += s0, ++_oh) { + for (int _w = 0, _ow = 0; _ow < ow; _w += s1, ++_ow) { + float acc = p_bias[_oc]; + // Below is the stencil computation that performs the hadamard + // product+accumulation of each input channel (contributing to the + // output channel being computed) with the corresponding weight + // channel. + // If the padding is 0, and dilation is 1, then we can remove the + // unnecessary checks, and simplify the code so that it can be + // vectorized by Tensilica compiler. + if (zero_pad_unit_dilation) { + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + const IT* in_plane = in_batch + _ic * h * w; + const WT* weight_plane = weight_batch + (_ic - sic) * wh * ww; + for (int _wh = 0; _wh < wh; ++_wh) { + for (int _ww = 0; _ww < ww; ++_ww) { + int ioff = (_h + _wh) * w + (_w + _ww); + int woff = _wh * ww + _ww; + float lhs = in_plane[ioff] - in_zero_point; + float rhs = weight_plane[woff] - + (quantized ? weight_zero_point : 0); + acc += lhs * rhs; + } + } + } + } else { + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + const IT* in_plane = in_batch + _ic * h * w; + const WT* weight_plane = weight_batch + (_ic - sic) * wh * ww; + for (int _wh = 0; _wh < wh; ++_wh) { + for (int _ww = 0; _ww < ww; ++_ww) { + if (((_h + d0 * _wh - p0) >= 0) && + ((_h + d0 * _wh - p0) < h) && + ((_w + d1 * _ww - p1) >= 0) && + ((_w + d1 * _ww - p1) < w)) { + int ioff = + (_h + d0 * _wh - p0) * w + (_w + d1 * _ww - p1); + int woff = _wh * ww + _ww; + float lhs = in_plane[ioff] - in_zero_point; + float rhs = weight_plane[woff] - + (quantized ? 
weight_zero_point : 0); + acc += lhs * rhs; + } + } + } + } + } + if (quantized) { + float val = bias_scale * acc; + out_plane[_oh * ow + _ow] = + kernels::quantize(val, inv_out_scale, out_zero_point); + } else { + out_plane[_oh * ow + _ow] = acc; + } + } + } + } + } + } +} + +template < + typename IT = float, + typename WT = IT, + typename BT = IT, + typename OT = IT, + bool quantized = false> +__attribute__((noinline)) void conv2d_nhwc_core_generic( + // All the arrays + const IT* __restrict__ p_in, + const WT* __restrict__ p_weight, + const BT* __restrict__ p_bias, + OT* __restrict__ p_out, + // The array sizes + int32_t n, + int32_t h, + int32_t w, + int32_t c, + int32_t oc, + int32_t wh, + int32_t ww, + int32_t wc, + int32_t oh, + int32_t ow, + // Stride + int16_t s0, + int16_t s1, + // Padding + int16_t p0, + int16_t p1, + // Dilation + int16_t d0, + int16_t d1, + // Group for depthwise conv + int16_t groups, + // Optional args that are only relevant for quantized convolution + // input zero point + IT in_zero_point = 0, + // weight zero point + int32_t weight_zero_point = 0, + float bias_scale = 1, + float out_scale = 1, + OT out_zero_point = 0) { + float inv_out_scale = 1. 
/ out_scale; + bool zero_pad_unit_dilation = d0 == 1 && d1 == 1 && p0 == 0 && p1 == 0; + + // Compute the number of in and out channels per group + const int ocpg = oc / groups; + const int icpg = c / groups; + + // Iterate over all the output batches (i.e., n) + for (int _n = 0; _n < n; ++_n) { + const IT* in_batch = p_in + _n * h * w * c; + OT* out_batch = p_out + _n * oh * ow * oc; + for (int _h = 0, _oh = 0; _oh < oh; _h += s0, ++_oh) { + for (int _w = 0, _ow = 0; _ow < ow; _w += s1, ++_ow) { + OT* out_line = out_batch + (_oh * ow + _ow) * oc; + // Compute separable convolution for each group + for (int _g = 0; _g < groups; ++_g) { + // Identify the input and output channels involved in the computation + // of this group + int sic = _g * icpg; + int soc = _g * ocpg; + // Populate all the output channels in the group + for (int _oc = soc; _oc < soc + ocpg; ++_oc) { + const WT* weight_batch = p_weight + _oc * wh * ww * wc; + // We compute one output channel at a time. The computation can be + // thought of as a stencil computation: we iterate over an input of + // size h x w x icpg, with a stencil of size wh x ww x icpg, to + // compute an output channel of size oh x ow x 1. + float acc = p_bias[_oc]; + // Below is the stencil computation that performs the hadamard + // product+accumulation of each input channel (contributing to + // the output channel being computed) with the corresponding + // weight channel. 
If the padding is 0, and dilation is 1, then + // we can remove the unnecessary checks, and simplify the code + // so that it can be vectorized by Tensilica compiler.x`` + if (zero_pad_unit_dilation) { + for (int _wh = 0; _wh < wh; ++_wh) { + for (int _ww = 0; _ww < ww; ++_ww) { + const IT* in_line = + in_batch + (_h + _wh) * w * c + (_w + _ww) * c; + const WT* weight_line = + weight_batch + _wh * ww * wc + _ww * wc; + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + float lhs = in_line[_ic] - in_zero_point; + float rhs = weight_line[_ic - sic] - + (quantized ? weight_zero_point : 0); + acc += lhs * rhs; + } + } + } + } else { + for (int _wh = 0; _wh < wh; ++_wh) { + for (int _ww = 0; _ww < ww; ++_ww) { + if (((_h + d0 * _wh - p0) >= 0) && + ((_h + d0 * _wh - p0) < h) && + ((_w + d1 * _ww - p1) >= 0) && + ((_w + d1 * _ww - p1 < w))) { + const IT* in_line = in_batch + + (_h + d0 * _wh - p0) * w * c + (_w + d1 * _ww - p1) * c; + const WT* weight_line = + weight_batch + _wh * ww * wc + _ww * wc; + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + float lhs = in_line[_ic] - in_zero_point; + float rhs = weight_line[_ic - sic] - + (quantized ? weight_zero_point : 0); + acc += lhs * rhs; + } + } + } + } + } + if (quantized) { + float val = bias_scale * acc; + out_line[_oc] = + kernels::quantize(val, inv_out_scale, out_zero_point); + } else { + out_line[_oc] = acc; + } + } + } + } + } + } +} + +// The quantized convolution kernel. in_scale and weight_scale are implicit in +// bias_scale, since it is a product of the two. The kernel will branch to +// quantized::conv1d or quantized::conv2d based on the dimensionality of +// activation tensor. 
+void quantized_conv_nchw( + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int16_t groups, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + bool conv1d = input.dim() == 3; + // input = [n, c, h, w] + const int n = input.size(0); + const int c = input.size(1); + const int h = conv1d ? 1 : input.size(2); + const int w = conv1d ? input.size(2) : input.size(3); + // weight = [oc, wc, wh, ww] + const int oc = weight.size(0); + const int wc = weight.size(1); + const int wh = conv1d ? 1 : weight.size(2); + const int ww = conv1d ? weight.size(2) : weight.size(3); + // output = [n, oc, oh, ow] + const int oh = conv1d ? 1 : out.size(2); + const int ow = conv1d ? out.size(2) : out.size(3); + +#define typed_quantized_conv2d_nchw(ctype, dtype) \ + case ScalarType::dtype: { \ + conv2d_nchw_core_generic( \ + input.const_data_ptr(), \ + weight.const_data_ptr(), \ + bias.const_data_ptr(), \ + out.mutable_data_ptr(), \ + n, \ + c, \ + h, \ + w, \ + oc, \ + wc, \ + wh, \ + ww, \ + oh, \ + ow, \ + stride[0], \ + stride[1], \ + padding[0], \ + padding[1], \ + dilation[0], \ + dilation[1], \ + groups, \ + in_zero_point, \ + weight_zero_point, \ + bias_scale, \ + output_scale, \ + (ctype)output_zero_point); \ + break; \ + } + ScalarType dtype = out.scalar_type(); + switch (dtype) { + ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_conv2d_nchw); + default: + ET_DCHECK_MSG( + false, "Unhandled dtype %s", torch::executor::toString(dtype)); + } + +#undef typed_quantized_conv2d_nchw +} + +void quantized_conv_nhwc( + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int16_t groups, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { 
+ bool conv1d = input.dim() == 3; + // input = [n, h, w, c] + const int n = input.size(0); + const int h = conv1d ? 1 : input.size(1); + const int w = conv1d ? input.size(1) : input.size(2); + const int c = conv1d ? input.size(2) : input.size(3); + // weight = [oc, wh, ww, wc] + const int oc = weight.size(0); + const int wh = conv1d ? 1 : weight.size(1); + const int ww = conv1d ? weight.size(1) : weight.size(2); + const int wc = conv1d ? weight.size(2) : weight.size(3); + // output = [n, oh, ow, oc] + const int oh = conv1d ? 1 : out.size(1); + const int ow = conv1d ? out.size(1) : out.size(2); + +#define typed_quantized_conv2d_nhwc(ctype, dtype) \ + case ScalarType::dtype: { \ + conv2d_nhwc_core_generic( \ + input.const_data_ptr(), \ + weight.const_data_ptr(), \ + bias.const_data_ptr(), \ + out.mutable_data_ptr(), \ + n, \ + h, \ + w, \ + c, \ + oc, \ + wh, \ + ww, \ + wc, \ + oh, \ + ow, \ + stride[0], \ + stride[1], \ + padding[0], \ + padding[1], \ + dilation[0], \ + dilation[1], \ + groups, \ + in_zero_point, \ + weight_zero_point, \ + bias_scale, \ + output_scale, \ + (ctype)output_zero_point); \ + break; \ + } + ScalarType dtype = out.scalar_type(); + switch (dtype) { + ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_conv2d_nhwc); + default: + ET_DCHECK_MSG( + false, "Unhandled dtype %s", torch::executor::toString(dtype)); + } + +#undef typed_quantized_conv2d_nhwc +} + +void quantized_conv_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + const Tensor& weight_zero_point, + const Tensor& bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED const Tensor& out_multiplier, + __ET_UNUSED const Tensor& out_shift, + bool channel_last, + Tensor& out) { + const float bias_scale_float = bias_scale.const_data_ptr()[0]; + const int32_t weight_zero_point_int = + 
weight_zero_point.const_data_ptr()[0]; + if (channel_last) { + quantized_conv_nhwc( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point_int, + bias_scale_float, + output_scale, + output_zero_point, + out); + } else { + quantized_conv_nchw( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point_int, + bias_scale_float, + output_scale, + output_zero_point, + out); + } +} + +void quantized_conv_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + bool channel_last, + Tensor& out) { + if (channel_last) { + quantized_conv_nhwc( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); + } else { + quantized_conv_nchw( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); + } +} + +void quantized_conv2d_nchw_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + int64_t out_multiplier, + int64_t out_shift, + Tensor& out) { + quantized_conv_per_tensor_out( + ctx, + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + 
out_multiplier, + out_shift, + false, // channel_last = false for NCHW + out); +} + +void quantized_conv2d_nhwc_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + int64_t out_multiplier, + int64_t out_shift, + Tensor& out) { + quantized_conv_per_tensor_out( + ctx, + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + true, // channel_last = true for NHWC + out); +} + +} // namespace native +} // namespace vision +} // namespace impl diff --git a/backends/cadence/vision/operators/op_quantized_fully_connected_out.cpp b/backends/cadence/vision/operators/op_quantized_fully_connected_out.cpp new file mode 100644 index 00000000000..29aa8906414 --- /dev/null +++ b/backends/cadence/vision/operators/op_quantized_fully_connected_out.cpp @@ -0,0 +1,97 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#include +#include +#include + +namespace impl { +namespace vision { +namespace native { + +using ::executorch::aten::optional; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::KernelRuntimeContext; + +void quantized_fully_connected_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& in, + const Tensor& weight, + const Tensor& bias, + int64_t in_zero_point, + const Tensor& weight_zero_point_t, + const Tensor& out_multiplier, + const Tensor& out_shift, + int64_t out_zero_point, + __ET_UNUSED const optional& offset, + Tensor& out) { +#define typed_quantized_linear(ctype, dtype) \ + case ScalarType::dtype: { \ + quantized_linear_( \ + in, \ + weight, \ + bias, \ + in_zero_point, \ + weight_zero_point_t, \ + out_multiplier, \ + out_shift, \ + out_zero_point, \ + out); \ + break; \ + } + + ScalarType dtype = out.scalar_type(); + switch (dtype) { + ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_linear); + default: + ET_DCHECK_MSG( + false, "Unhandled dtype %s", torch::executor::toString(dtype)); + } +#undef typed_quantized_linear +} + +void quantized_fully_connected_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& in, + const Tensor& weight, + const Tensor& bias, + int64_t in_zero_point, + int64_t weight_zero_point, + int64_t out_multiplier, + int64_t out_shift, + int64_t out_zero_point, + __ET_UNUSED const optional& offset, + Tensor& out) { +#define typed_quantized_linear(ctype, dtype) \ + case ScalarType::dtype: { \ + quantized_linear_per_tensor_( \ + in, \ + weight, \ + bias, \ + in_zero_point, \ + weight_zero_point, \ + out_multiplier, \ + out_shift, \ + out_zero_point, \ + out); \ + break; \ + } + + ScalarType dtype = out.scalar_type(); + switch (dtype) { + ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_linear); + default: + ET_DCHECK_MSG( + false, "Unhandled dtype %s", torch::executor::toString(dtype)); + } +#undef typed_quantized_linear +} + +}; // namespace native 
+}; // namespace vision +}; // namespace impl diff --git a/backends/cadence/vision/operators/op_quantized_layer_norm.cpp b/backends/cadence/vision/operators/op_quantized_layer_norm.cpp new file mode 100644 index 00000000000..a9685eddedb --- /dev/null +++ b/backends/cadence/vision/operators/op_quantized_layer_norm.cpp @@ -0,0 +1,201 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include + +using ::executorch::aten::IntArrayRef; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::getLeadingDims; +using ::executorch::runtime::KernelRuntimeContext; + +namespace impl { +namespace vision { +namespace native { + +// Compute quantized layer_norm. The current implementation assumes that the +// input is per-tensor quantized. +template +void quantized_layer_norm_per_tensor_( + const Tensor& input, + double input_scale, + int64_t input_zero_point, + const Tensor& weight, + const Tensor& bias, + double eps, + double output_scale, + int64_t output_zero_point, + Tensor& out) { + // Get the raw pointers to input, output, weight, and bias + const T* __restrict__ in_data = input.const_data_ptr(); + T* __restrict__ out_data = out.mutable_data_ptr(); + const float* __restrict__ weight_data = weight.const_data_ptr(); + const float* __restrict__ bias_data = bias.const_data_ptr(); + + float output_inv_scale = 1.0f / output_scale; + + size_t last_dim = input.size(input.dim() - 1); + size_t leading_dims = getLeadingDims(input, input.dim() - 1); + + // Visualize the input tensor as a set of 1d vectors, and compute the + // layer_norm for each vector. + for (size_t i = 0; i < leading_dims; ++i) { + const T* x = in_data + i * last_dim; + T* y = out_data + i * last_dim; + + // compute sum and squared sum. 
The fp32 sum can be approximated as: + // (X_1 - in_zero_point) * in_scale + (X_2 - in_zero_point) * in_scale + ... + // (X_N - in_zero_point) * in_scale. + int32_t sum = 0; + int32_t sq_sum = last_dim * input_zero_point * input_zero_point; + for (size_t j = 0; j < last_dim; ++j) { + int32_t val = x[j]; + sum += val; + sq_sum += val * val; + } + sq_sum -= (2 * sum * input_zero_point); + sum -= (last_dim * input_zero_point); + + float mean = (input_scale * sum) / last_dim; + float variance = + (sq_sum * input_scale * input_scale) / last_dim - mean * mean; + float inv_std = 1.0f / std::sqrt(variance + eps); + + // y = (x - mean) / std * kGamma + kBeta + for (int j = 0; j < last_dim; ++j) { + // y[j] = (x[j] - mean) / std * kGamma + kBeta; + // Since X is quantized, we dequantize it, compute fp32 result, and + // quantize the result to an int8/uint8 value. + float val = kernels::dequantize(x[j], input_scale, input_zero_point); + + val = (val - mean) * inv_std * weight_data[j] + bias_data[j]; + y[j] = kernels::quantize(val, output_inv_scale, output_zero_point); + } + } +} + +// Compute quantized layer_norm. The current implementation assumes that the +// input is per-tensor quantized. +template +void quantized_layer_norm_( + const Tensor& input, + const Tensor& in_scale, + const Tensor& in_zero_point, + const Tensor& weight, + const Tensor& bias, + double eps, + double output_scale, + int64_t output_zero_point, + Tensor& out) { + // Extract the zero point and scale for input tensor. 
+ float input_scale = in_scale.const_data_ptr()[0]; + int64_t input_zero_point = in_zero_point.const_data_ptr()[0]; + + // Call other overload + quantized_layer_norm_per_tensor_( + input, + input_scale, + input_zero_point, + weight, + bias, + eps, + output_scale, + output_zero_point, + out); +} + +void quantized_layer_norm_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& in_scale, + const Tensor& in_zero_point, + __ET_UNUSED const executorch::aten::IntArrayRef normalized_shape, + const Tensor& weight, + const Tensor& bias, + double eps, + double output_scale, + int64_t output_zero_point, + Tensor& out) { + if (input.scalar_type() == executorch::aten::ScalarType::Byte) { + quantized_layer_norm_( + input, + in_scale, + in_zero_point, + weight, + bias, + eps, + output_scale, + output_zero_point, + out); + } else if (input.scalar_type() == executorch::aten::ScalarType::Char) { + quantized_layer_norm_( + input, + in_scale, + in_zero_point, + weight, + bias, + eps, + output_scale, + output_zero_point, + out); + } else { + ET_CHECK_MSG( + false, + "Unhandled input dtype %hhd", + static_cast(input.scalar_type())); + } +} + +void quantized_layer_norm_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + double in_scale, + int64_t in_zero_point, + __ET_UNUSED const executorch::aten::IntArrayRef normalized_shape, + const Tensor& weight, + const Tensor& bias, + double eps, + double output_scale, + int64_t output_zero_point, + Tensor& out) { + if (input.scalar_type() == executorch::aten::ScalarType::Byte) { + quantized_layer_norm_per_tensor_( + input, + in_scale, + in_zero_point, + weight, + bias, + eps, + output_scale, + output_zero_point, + out); + } else if (input.scalar_type() == executorch::aten::ScalarType::Char) { + quantized_layer_norm_per_tensor_( + input, + in_scale, + in_zero_point, + weight, + bias, + eps, + output_scale, + output_zero_point, + out); + } else { + ET_CHECK_MSG( + false, + "Unhandled 
input dtype %hhd", + static_cast(input.scalar_type())); + } +} + +}; // namespace native +}; // namespace vision +}; // namespace impl diff --git a/backends/cadence/vision/operators/op_quantized_linear_out.cpp b/backends/cadence/vision/operators/op_quantized_linear_out.cpp new file mode 100644 index 00000000000..b6b7cdd17bc --- /dev/null +++ b/backends/cadence/vision/operators/op_quantized_linear_out.cpp @@ -0,0 +1,159 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +namespace impl { +namespace vision { +namespace native { + +using executorch::aten::Tensor; +using executorch::runtime::getLeadingDims; +using executorch::runtime::KernelRuntimeContext; + +template +void inline _typed_quantized_linear( + const Tensor& src, + const Tensor& weight, + const Tensor& bias, + int64_t src_zero_point, + const Tensor& weight_zero_point_t, + const Tensor& out_multiplier, + const Tensor& out_shift, + int64_t out_zero_point, + Tensor& out) { + const T* __restrict__ src_data = src.const_data_ptr(); + const T* __restrict__ weight_data = weight.const_data_ptr(); + const int32_t* __restrict__ bias_data = bias.const_data_ptr(); + T* __restrict__ out_data = out.mutable_data_ptr(); + + int32_t weight_zero_point = weight_zero_point_t.const_data_ptr()[0]; + + // input comes in shape [batch_size, in_dim] + // weight comes in shape [out_dim, in_dim] + // output comes in empty with shape [batch_size, out_dim] + // Perform matrix multiply (M x N) x (N x P) => M x P + const auto M = weight.size(0); // = out_dim + const auto N = weight.size(1); // = in_dim + + // Given an N-dimensional input [d0, d1, d2, ..., d_{N-2}, d_{N-1}], the + // leading dimensions is d0 * d1 * ... 
* d_{N-2} + const auto leading_dims = getLeadingDims(src, src.dim() - 1); + + ET_CHECK_MSG( + out_multiplier.numel() == 1, "out_multiplier should have one element"); + ET_CHECK_MSG( + out_shift.numel() == 1, "out_multiplier should have one element"); + + const int32_t* __restrict__ out_multiplier_data = + out_multiplier.const_data_ptr(); + const int32_t* __restrict__ out_shift_data = + out_shift.const_data_ptr(); + + // Compute the out_scale from out_multiplier and out_shift + const float out_scale = + -out_multiplier_data[0] * 1.0 / (1 << 31) * pow(2, out_shift_data[0]); + + for (int i = 0; i < leading_dims; ++i) { + for (int j = 0; j < M; ++j) { + float sum = bias_data[j]; + for (int k = 0; k < N; ++k) { + sum += (src_data[i * N + k] - src_zero_point) * + (weight_data[j * N + k] - weight_zero_point); + } + out_data[i * M + j] = + kernels::quantize(sum, out_scale, out_zero_point); + } + } +} + +void quantized_linear_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& src, + const Tensor& weight, + const Tensor& bias, + int64_t src_zero_point, + const Tensor& weight_zero_point_t, + const Tensor& out_multiplier, + const Tensor& out_shift, + int64_t out_zero_point, + __ET_UNUSED const executorch::aten::optional& offset, + Tensor& out) { + // TODO: refactor to use switch case as quantized_linear_per_tensor_out + if (out.scalar_type() == executorch::aten::ScalarType::Byte) { + _typed_quantized_linear( + src, + weight, + bias, + src_zero_point, + weight_zero_point_t, + out_multiplier, + out_shift, + out_zero_point, + out); + } else if (out.scalar_type() == executorch::aten::ScalarType::Char) { + _typed_quantized_linear( + src, + weight, + bias, + src_zero_point, + weight_zero_point_t, + out_multiplier, + out_shift, + out_zero_point, + out); + } else { + ET_CHECK_MSG( + false, + "Unhandled input dtype %hhd", + static_cast(src.scalar_type())); + } +} + +void quantized_linear_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& src, + const 
Tensor& weight, + const Tensor& bias, + const int64_t src_zero_point, + const int64_t weight_zero_point, + const int64_t out_multiplier, + const int64_t out_shift, + const int64_t out_zero_point, + __ET_UNUSED const executorch::aten::optional& offset, + Tensor& out) { +#define typed_quantized_linear_per_tensor(ctype, dtype) \ + case executorch::aten::ScalarType::dtype: { \ + quantized_linear_per_tensor_( \ + src, \ + weight, \ + bias, \ + src_zero_point, \ + weight_zero_point, \ + out_multiplier, \ + out_shift, \ + out_zero_point, \ + out); \ + break; \ + } + + executorch::aten::ScalarType dtype = out.scalar_type(); + switch (dtype) { + ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_linear_per_tensor); + default: + ET_DCHECK_MSG( + false, "Unhandled dtype %s", executorch::runtime::toString(dtype)); + } +#undef typed_quantized_linear_per_tensor +} + +}; // namespace native +}; // namespace vision +}; // namespace impl diff --git a/backends/cadence/vision/operators/op_quantized_matmul_out.cpp b/backends/cadence/vision/operators/op_quantized_matmul_out.cpp new file mode 100644 index 00000000000..54a303288c3 --- /dev/null +++ b/backends/cadence/vision/operators/op_quantized_matmul_out.cpp @@ -0,0 +1,157 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +namespace impl { +namespace vision { +namespace native { + +using executorch::aten::Tensor; +using executorch::runtime::getLeadingDims; +using executorch::runtime::KernelRuntimeContext; + +// The quantized matmul. The quantized matmul accumulates in a wider register, +// whose type is TA. 
+template < + typename TZ, + typename TA = float, + bool transposed = false, + typename TX = TZ, + typename TY = TZ> +__attribute__((noinline)) void qmatmul( + TZ* __restrict__ Z, + int32_t Z_multiplier, + int32_t Z_shift, + int32_t Z_zero_point, + const TX* __restrict__ X, + int32_t X_zero_point, + const TY* __restrict__ y, + int32_t Y_zero_point, + size_t m, + size_t n, + size_t p) { + // Compute the Z_scale from Z_multiplier and Z_shift + const float Z_scale = -Z_multiplier * 1.0 / (1 << 31) * pow(2, Z_shift); + for (size_t i = 0; i < m; ++i) { + for (size_t j = 0; j < p; ++j) { + TA sum = 0; + for (size_t k = 0; k < n; ++k) { + if (transposed) { + sum += (X[i * n + k] - X_zero_point) * (y[j * n + k] - Y_zero_point); + } else { + sum += (X[i * n + k] - X_zero_point) * (y[k * p + j] - Y_zero_point); + } + } + Z[i * p + j] = kernels::quantize(sum, Z_scale, Z_zero_point); + } + } +} + +template +void inline _typed_quantized_matmul( + const Tensor& X, + int64_t X_zero_point, + const Tensor& Y, + int64_t Y_zero_point, + const executorch::aten::optional& bias, + int64_t out_multiplier, + int64_t out_shift, + int64_t out_zero_point, + bool transposed, + Tensor& out) { + size_t batch_size = getLeadingDims(X, X.dim() - 2); + size_t leading_dim = X.size(X.dim() - 2); + size_t out_dim = Y.size(Y.dim() - 1 - transposed); + size_t in_dim = X.size(X.dim() - 1); + + T* __restrict__ out_data = out.mutable_data_ptr(); + const T* __restrict__ X_data = X.const_data_ptr(); + const T* __restrict__ Y_data = Y.const_data_ptr(); + for (size_t i = 0; i < batch_size; ++i) { + const T* x = X_data + i * leading_dim * in_dim; + const T* y = Y_data + i * in_dim * out_dim; + T* z = out_data + i * leading_dim * out_dim; + if (transposed) { + qmatmul( + z, + static_cast(out_multiplier), + static_cast(out_shift), + static_cast(out_zero_point), + x, + static_cast(X_zero_point), + y, + static_cast(Y_zero_point), + leading_dim, + in_dim, + out_dim); + } else { + qmatmul( + z, + 
static_cast(out_multiplier), + static_cast(out_shift), + static_cast(out_zero_point), + x, + static_cast(X_zero_point), + y, + static_cast(Y_zero_point), + leading_dim, + in_dim, + out_dim); + } + } +} + +void quantized_matmul_out( + KernelRuntimeContext& ctx, + const Tensor& X, + int64_t X_zero_point, + const Tensor& Y, + int64_t Y_zero_point, + const executorch::aten::optional& bias, + int64_t out_multiplier, + int64_t out_shift, + int64_t out_zero_point, + bool transposed, + Tensor& out) { + if (out.scalar_type() == executorch::aten::ScalarType::Byte) { + _typed_quantized_matmul( + X, + X_zero_point, + Y, + Y_zero_point, + bias, + out_multiplier, + out_shift, + out_zero_point, + transposed, + out); + } else if (out.scalar_type() == executorch::aten::ScalarType::Char) { + _typed_quantized_matmul( + X, + X_zero_point, + Y, + Y_zero_point, + bias, + out_multiplier, + out_shift, + out_zero_point, + transposed, + out); + } else { + ET_CHECK_MSG( + false, + "Unhandled input dtype %hhd", + static_cast(X.scalar_type())); + } +} + +}; // namespace native +}; // namespace vision +}; // namespace impl diff --git a/backends/cadence/vision/operators/op_quantized_relu_out.cpp b/backends/cadence/vision/operators/op_quantized_relu_out.cpp new file mode 100644 index 00000000000..45b9e09b1dd --- /dev/null +++ b/backends/cadence/vision/operators/op_quantized_relu_out.cpp @@ -0,0 +1,134 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +namespace impl { +namespace vision { +namespace native { + +using executorch::aten::Tensor; +using executorch::runtime::KernelRuntimeContext; + +template +void quantized_relu_( + const Tensor& input, + const Tensor& in_zero_point, + const int64_t out_zero_point, + const Tensor& out_multiplier, + const Tensor& out_shift, + Tensor& output) { + T q_zero_point = in_zero_point.const_data_ptr()[0]; + const T* __restrict__ in = input.const_data_ptr(); + T* __restrict__ out = output.mutable_data_ptr(); + + const int32_t* __restrict__ out_multiplier_data = + out_multiplier.const_data_ptr(); + const int32_t* __restrict__ out_shift_data = + out_shift.const_data_ptr(); + + // Compute the out_scale from out_multiplier and out_shift + const float out_scale = + -out_multiplier_data[0] * 1.0 / (1 << 31) * pow(2, out_shift_data[0]); + + for (size_t i = 0, e = input.numel(); i < e; ++i) { + const T temp = in[i] > q_zero_point ? (in[i] - q_zero_point) : 0; + out[i] = kernels::quantize(temp, out_scale, out_zero_point); + } +} + +void quantized_relu_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& in_zero_point, + const int64_t out_zero_point, + const Tensor& out_multiplier, + const Tensor& out_shift, + Tensor& output) { + if (input.scalar_type() == executorch::aten::ScalarType::Byte) { + quantized_relu_( + input, + in_zero_point, + out_zero_point, + out_multiplier, + out_shift, + output); + } else if (input.scalar_type() == executorch::aten::ScalarType::Char) { + quantized_relu_( + input, + in_zero_point, + out_zero_point, + out_multiplier, + out_shift, + output); + } else { + ET_CHECK_MSG( + false, + "Unhandled input dtype %hhd", + static_cast(input.scalar_type())); + } +} + +template +void quantized_relu_per_tensor_out_( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const int64_t in_zero_point, + const int64_t out_zero_point, + const int64_t out_multiplier, + const int64_t out_shift, + Tensor& 
output) { + const T* __restrict__ in = input.const_data_ptr(); + T* __restrict__ out = output.mutable_data_ptr(); + + // Compute the out_scale from out_multiplier and out_shift + const float out_scale = -out_multiplier * 1.0 / (1 << 31) * pow(2, out_shift); + + for (size_t i = 0, e = input.numel(); i < e; ++i) { + const float temp = in[i] > in_zero_point ? (in[i] - in_zero_point) : 0; + out[i] = kernels::quantize(temp, out_scale, out_zero_point); + } +} + +void quantized_relu_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const int64_t in_zero_point, + const int64_t out_zero_point, + const int64_t out_multiplier, + const int64_t out_shift, + Tensor& output) { +#define typed_quantized_relu(ctype, dtype) \ + case executorch::aten::ScalarType::dtype: { \ + quantized_relu_per_tensor_out_( \ + ctx, \ + input, \ + in_zero_point, \ + out_zero_point, \ + out_multiplier, \ + out_shift, \ + output); \ + break; \ + } + + executorch::aten::ScalarType dtype = input.scalar_type(); + switch (dtype) { + ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_relu) + default: + ET_DCHECK_MSG( + false, "Unhandled dtype %s", torch::executor::toString(dtype)); + } + +#undef typed_quantized_relu +} + +}; // namespace native +}; // namespace vision +}; // namespace impl diff --git a/backends/cadence/vision/operators/op_requantize_out.cpp b/backends/cadence/vision/operators/op_requantize_out.cpp new file mode 100644 index 00000000000..ef538bf4045 --- /dev/null +++ b/backends/cadence/vision/operators/op_requantize_out.cpp @@ -0,0 +1,266 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +namespace impl { +namespace vision { +namespace native { + +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::runtime::KernelRuntimeContext; + +// Requantize the int8_t/uint8_t input tensor to a uint8_t/int8_t out tensor. +// The scale and zero_point for requantization are in the args. +Tensor& requantize_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& in_scale_t, + const Tensor& in_zero_point_t, + const Tensor& out_scale_t, + const Tensor& out_zero_point_t, + const ScalarType out_dtype, + Tensor& out) { + ET_KERNEL_CHECK_MSG( + ctx, + in_scale_t.scalar_type() == ScalarType::Float, + InvalidArgument, + out, + "In scale is not a float: %s", + torch::executor::toString(in_scale_t.scalar_type())); + float in_scale = in_scale_t.const_data_ptr()[0]; + + ET_KERNEL_CHECK_MSG( + ctx, + in_zero_point_t.scalar_type() == ScalarType::Int, + InvalidArgument, + out, + "In zero point is not an int: %s", + torch::executor::toString(in_zero_point_t.scalar_type())); + int32_t in_zero_point = in_zero_point_t.const_data_ptr()[0]; + + ET_KERNEL_CHECK_MSG( + ctx, + out_scale_t.scalar_type() == ScalarType::Float, + InvalidArgument, + out, + "Out scale is not a float: %s", + torch::executor::toString(out_scale_t.scalar_type())); + float out_scale = out_scale_t.const_data_ptr()[0]; + + ET_KERNEL_CHECK_MSG( + ctx, + out_zero_point_t.scalar_type() == ScalarType::Int, + InvalidArgument, + out, + "Out zero point is not an int: %s", + torch::executor::toString(out_zero_point_t.scalar_type())); + int32_t out_zero_point = out_zero_point_t.const_data_ptr()[0]; + + ET_KERNEL_CHECK_MSG( + ctx, + out.scalar_type() == out_dtype, + InvalidArgument, + out, + "Out tensor dtype (%s) does not match the passed in out dtype (%s)", + torch::executor::toString(out.scalar_type()), + torch::executor::toString(out_dtype)); + + const size_t numel = out.numel(); + ScalarType in_dtype = input.scalar_type(); + + // Assert 
that the output tensor's dtype is same as out_dtype. + ET_KERNEL_CHECK_MSG( + ctx, + out_dtype == out.scalar_type(), + InvalidArgument, + out, + "Out dtype %s does not match requant dtype %s", + torch::executor::toString(out.scalar_type()), + torch::executor::toString(out_dtype)); + +#define typed_requantize(ctype, dtype) \ + const ctype* input_data = input.const_data_ptr(); \ + dtype* out_data = out.mutable_data_ptr(); \ + kernels::requantize( \ + out_data, \ + input_data, \ + in_scale, \ + in_zero_point, \ + 1.0 / out_scale, \ + out_zero_point, \ + numel); + +#define typed_requantize_in(ctype) \ + switch (out_dtype) { \ + case ScalarType::Byte: { \ + typed_requantize(ctype, uint8_t); \ + break; \ + } \ + case ScalarType::Char: { \ + typed_requantize(ctype, int8_t); \ + break; \ + } \ + case ScalarType::UInt16: { \ + typed_requantize(ctype, uint16_t); \ + break; \ + } \ + case ScalarType::Short: { \ + typed_requantize(ctype, int16_t); \ + break; \ + } \ + default: \ + ET_KERNEL_CHECK_MSG( \ + ctx, \ + false, \ + InvalidArgument, \ + out, \ + "Unhandled output dtype %s", \ + torch::executor::toString(out_dtype)); \ + } + + switch (in_dtype) { + case ScalarType::Byte: { + typed_requantize_in(uint8_t); + break; + } + case ScalarType::Char: { + typed_requantize_in(int8_t); + break; + } + case ScalarType::UInt16: { + typed_requantize_in(uint16_t); + break; + } + case ScalarType::Short: { + typed_requantize_in(int16_t); + break; + } + default: + ET_KERNEL_CHECK_MSG( + ctx, + false, + InvalidArgument, + out, + "Unhandled input dtype %s", + torch::executor::toString(in_dtype)); + } +#undef typed_requantize_in +#undef typed_requantize + return out; +} + +// Requantize the int8_t/uint8_t input tensor to a uint8_t/int8_t out tensor. +// The scale and zero_point for requantization are in the args. 
+Tensor& requantize_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + double in_scale, + int64_t in_zero_point, + double out_scale, + int64_t out_zero_point, + const ScalarType out_dtype, + Tensor& out) { + ET_KERNEL_CHECK_MSG( + ctx, + out.scalar_type() == out_dtype, + InvalidArgument, + out, + "Out tensor dtype (%s) does not match the passed in out dtype (%s)", + torch::executor::toString(out.scalar_type()), + torch::executor::toString(out_dtype)); + + const size_t numel = out.numel(); + ScalarType in_dtype = input.scalar_type(); + + // Assert that the output tensor's dtype is same as out_dtype. + ET_KERNEL_CHECK_MSG( + ctx, + out_dtype == out.scalar_type(), + InvalidArgument, + out, + "Out dtype %s does not match requant dtype %s", + torch::executor::toString(out.scalar_type()), + torch::executor::toString(out_dtype)); + +#define typed_requantize(ctype, dtype) \ + const ctype* input_data = input.const_data_ptr(); \ + dtype* out_data = out.mutable_data_ptr(); \ + kernels::requantize( \ + out_data, \ + input_data, \ + static_cast(in_scale), \ + static_cast(in_zero_point), \ + 1.0 / static_cast(out_scale), \ + static_cast(out_zero_point), \ + numel); + +#define typed_requantize_in(ctype) \ + switch (out_dtype) { \ + case ScalarType::Byte: { \ + typed_requantize(ctype, uint8_t); \ + break; \ + } \ + case ScalarType::Char: { \ + typed_requantize(ctype, int8_t); \ + break; \ + } \ + case ScalarType::UInt16: { \ + typed_requantize(ctype, uint16_t); \ + break; \ + } \ + case ScalarType::Short: { \ + typed_requantize(ctype, int16_t); \ + break; \ + } \ + default: \ + ET_KERNEL_CHECK_MSG( \ + ctx, \ + false, \ + InvalidArgument, \ + out, \ + "Unhandled output dtype %s", \ + torch::executor::toString(out_dtype)); \ + } + + switch (in_dtype) { + case ScalarType::Byte: { + typed_requantize_in(uint8_t); + break; + } + case ScalarType::Char: { + typed_requantize_in(int8_t); + break; + } + case ScalarType::UInt16: { + typed_requantize_in(uint16_t); + break; 
+ } + case ScalarType::Short: { + typed_requantize_in(int16_t); + break; + } + default: + ET_KERNEL_CHECK_MSG( + ctx, + false, + InvalidArgument, + out, + "Unhandled input dtype %s", + torch::executor::toString(in_dtype)); + } +#undef typed_requantize_in +#undef typed_requantize + return out; +} + +} // namespace native +} // namespace vision +} // namespace impl diff --git a/backends/cadence/vision/operators/op_softmax.cpp b/backends/cadence/vision/operators/op_softmax.cpp new file mode 100644 index 00000000000..58ca33c6a0b --- /dev/null +++ b/backends/cadence/vision/operators/op_softmax.cpp @@ -0,0 +1,303 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::runtime::KernelRuntimeContext; +using torch::executor::Error; + +namespace impl { +namespace vision { +namespace native { + +Tensor& _softmax_out( + KernelRuntimeContext& ctx, + const Tensor& in, + int64_t dim, + bool half_to_float, + Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK( + ctx, + torch::executor::check_softmax_args(in, dim, half_to_float, out), + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(in, out), + InvalidArgument, + out); + + // Adjust for negative dim + dim = dim < 0 ? 
dim + executorch::runtime::nonzero_dim(in) : dim; + + const executorch::aten::optional& dim_t = dim; + const size_t d = ET_NORMALIZE_IX(dim_t.value(), in.dim()); + const size_t size = in.size(d); + + size_t stride = 1, outer_size = 1; + + size_t outer_stride = 1; + + constexpr auto name = "_softmax.out"; + constexpr int MaxDim = 5; + + bool optimized = true; + bool ping_pong_process = false; + bool ping_process_pong = false; + + if ((d == in.dim() - 1)) { + if (size <= IDMA_BUFF_SIZE / 4 && in.dim() != 1) { + ping_pong_process = true; + } else if (size <= IDMA_BUFF_SIZE / 2) { + ping_process_pong = true; + } + } + + if (out.scalar_type() != ScalarType::Float) + optimized = false; + + if (in.dim() > MaxDim) + optimized = false; + + if (optimized) { + const float* ptr_inp = (float*)in.const_data_ptr(); + float* out_data = (float*)out.mutable_data_ptr(); + + /* Channel 0*/ + idma_init(0, 0, MAX_BLOCK_16, 8, TICK_CYCLES_1, 0, NULL); + idma_init_loop(0, descbuf[0], IDMA_2D_DESC, 1, NULL, NULL); + + /* Channel 1*/ + idma_init(1, 0, MAX_BLOCK_16, 8, TICK_CYCLES_1, 0, NULL); + idma_init_loop(1, descbuf[1], IDMA_2D_DESC, 1, NULL, NULL); + + if (ping_pong_process) { + for (int i = 0; i < in.dim(); i++) { + if (i != d) + outer_size *= in.size(i); + } + + outer_stride = size; + stride = size; + + int pp_swap = 0; + + float32_t* ptr_out = out_data; + float32_t* ptr_in = (float32_t*)ptr_inp; + + idma_copy_2d_desc( + 0, inpData[pp_swap], ptr_in, 4 * stride, DESC_IDMA_PRIOR_H, 1, 0, 0); + pp_swap = 1; + + for (int i = 0; i < (outer_size - 1); i++) { + IDMA_HW_WAIT_ALL(0); + ptr_in += outer_stride; + idma_copy_2d_desc( + 0, + inpData[pp_swap], + ptr_in, + 4 * stride, + DESC_IDMA_PRIOR_H, + 1, + 0, + 0); + pp_swap = pp_swap ^ 1; + + /* PROCESS CALL */ + vsoftmaxf(outData[pp_swap], inpData[pp_swap], stride); + + IDMA_HW_WAIT_ALL(1); + idma_copy_2d_desc( + 1, + ptr_out, + outData[pp_swap], + 4 * stride, + DESC_IDMA_PRIOR_H, + 1, + 0, + 0); + ptr_out += outer_stride; + } + + 
IDMA_HW_WAIT_ALL(0); + pp_swap = pp_swap ^ 1; + + /* PROCESS CALL */ + vsoftmaxf(outData[pp_swap], inpData[pp_swap], stride); + + IDMA_HW_WAIT_ALL(1); + idma_copy_2d_desc( + 1, ptr_out, outData[pp_swap], 4 * stride, DESC_IDMA_PRIOR_H, 1, 0, 0); + + IDMA_HW_WAIT_ALL(1); + + return out; + } else if (ping_process_pong) { + for (int i = 0; i < in.dim(); i++) { + if (i != d) + outer_size *= in.size(i); + } + + outer_stride = size; + stride = size; + + float32_t* ptr_out = out_data; + float32_t* ptr_in = (float32_t*)ptr_inp; + + for (int i = 0; i < outer_size; i++) { + idma_copy_2d_desc( + 0, data_dram0, ptr_in, 4 * stride, DESC_IDMA_PRIOR_H, 1, 0, 0); + IDMA_HW_WAIT_ALL(0); + + vsoftmaxf(data_dram1, data_dram0, stride); + + idma_copy_2d_desc( + 1, ptr_out, data_dram1, 4 * stride, DESC_IDMA_PRIOR_H, 1, 0, 0); + IDMA_HW_WAIT_ALL(1); + + ptr_in += outer_stride; + ptr_out += outer_stride; + } + + return out; + } else { + int num_inp_dims = in.dim(); + int num_out_dims = num_inp_dims; + + int ptr_inp_shape[MaxDim]; + int ptr_out_shape[MaxDim]; + int ptr_permute_vec[MaxDim]; + + for (int i = 0; i < num_inp_dims; i++) + ptr_inp_shape[i] = in.size(i); + + for (int i = 0; i < num_inp_dims; i++) { + if (i == d) + ptr_permute_vec[i] = num_inp_dims - 1; + else if (i == (num_inp_dims - 1)) + ptr_permute_vec[num_inp_dims - 1] = d; + else + ptr_permute_vec[i] = i; + + ptr_out_shape[i] = ptr_inp_shape[ptr_permute_vec[i]]; + + if (i != d) + outer_size = outer_size * ptr_inp_shape[i]; + } + + outer_stride = size; + + float* ptr_out = (float*)kernels::allocate_temp_memory( + ctx, out.numel() * sizeof(float)); + + ET_KERNEL_CHECK(ctx, ptr_out != nullptr, MemoryAllocationFailed, out); + + float* ptr_out1 = (float*)kernels::allocate_temp_memory( + ctx, out.numel() * sizeof(float)); + + ET_KERNEL_CHECK(ctx, ptr_out1 != nullptr, MemoryAllocationFailed, out); + + tensor_transposef( + ptr_out, + ptr_out_shape, + ptr_inp, + ptr_inp_shape, + ptr_permute_vec, + num_out_dims, + num_inp_dims); + + 
for (size_t outer_idx = 0; outer_idx < outer_size; ++outer_idx) { + size_t outer = outer_idx * outer_stride; + for (size_t inner_idx = 0; inner_idx < stride; ++inner_idx) { + size_t base = outer + inner_idx; + + float* ptr_in_data = &ptr_out[base]; + float* ptr_out_data = &ptr_out1[base]; + + vsoftmaxf(ptr_out_data, ptr_in_data, size); + } + } + + tensor_transposef( + out_data, + ptr_inp_shape, + ptr_out1, + ptr_out_shape, + ptr_permute_vec, + num_out_dims, + num_inp_dims); + + return out; + } + } + + ET_SWITCH_FLOATHBF16_TYPES( + in.scalar_type(), ctx, "_softmax.out", CTYPE, [&]() { + const CTYPE* const in_data = in.const_data_ptr(); + CTYPE* const out_data = out.mutable_data_ptr(); + + torch::executor::apply_over_dim( + [in_data, out_data]( + const size_t size, const size_t stride, const size_t base) { + // calculate max in softmax dim. During softmax computation each + // value is subtracted by the maximum in value before calling exp + // to preserve numerical stability. + const CTYPE max_in = torch::executor::apply_unary_reduce_fn( + [](const CTYPE val_in, CTYPE val_accum) { + return std::max(val_in, val_accum); + }, + in_data + base, + size, + stride); + + const CTYPE temp_sum = + torch::executor::apply_unary_map_reduce_fn( + [max_in](const CTYPE val_in) { + return std::exp(val_in - max_in); + }, + [](const CTYPE mapped_in, CTYPE val_accum) { + return val_accum + mapped_in; + }, + in_data + base, + size, + stride); + + torch::executor::apply_unary_map_fn( + [max_in, temp_sum](const CTYPE val_in) { + return std::exp(val_in - max_in) / temp_sum; + }, + in_data + base, + out_data + base, + size, + stride); + }, + in, + dim); + }); + + return out; +} + +} // namespace native +} // namespace vision +} // namespace impl diff --git a/backends/cadence/generic/operators/op_view_copy.cpp b/backends/cadence/vision/operators/op_view_copy.cpp similarity index 80% rename from backends/cadence/generic/operators/op_view_copy.cpp rename to 
backends/cadence/vision/operators/op_view_copy.cpp index 162e9ee201b..6d4d3a8a5e0 100644 --- a/backends/cadence/generic/operators/op_view_copy.cpp +++ b/backends/cadence/vision/operators/op_view_copy.cpp @@ -8,10 +8,12 @@ #include -namespace torch { -namespace executor { +namespace impl { +namespace vision { namespace native { +using executorch::aten::IntArrayRef; +using ::executorch::aten::IntArrayRef; using executorch::aten::Tensor; using executorch::runtime::KernelRuntimeContext; @@ -25,5 +27,5 @@ Tensor& view_copy_out( } } // namespace native -} // namespace executor -} // namespace torch +} // namespace vision +} // namespace impl diff --git a/backends/cadence/vision/operators/operators.h b/backends/cadence/vision/operators/operators.h new file mode 100644 index 00000000000..36c4486bf85 --- /dev/null +++ b/backends/cadence/vision/operators/operators.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace impl { +namespace vision { +namespace native { + +using ::executorch::runtime::getLeadingDims; + +#define ET_FORALL_CADENCE_QUANTIZED_TYPES(_) \ + _(uint8_t, Byte) \ + _(int8_t, Char) + +inline __attribute__((always_inline)) void linear_( + const ::executorch::aten::Tensor& input, + const ::executorch::aten::Tensor& weight, + const ::executorch::aten::optional<::executorch::aten::Tensor>& bias, + ::executorch::aten::Tensor& output) { + const float* __restrict__ input_data = input.const_data_ptr(); + const float* __restrict__ weight_data = weight.const_data_ptr(); + const float* __restrict__ bias_data = bias.value().const_data_ptr(); + float* __restrict__ output_data = output.mutable_data_ptr(); + + // input comes in shape [batch_size, in_dim] + // weight comes in shape [out_dim, in_dim] + // output comes in empty with shape [batch_size, out_dim] + // Perform matrix multiply (M x N) x (N x P) => M x P + int64_t M = weight.size(0); // = out_dim + int64_t N = weight.size(1); // = in_dim + + // Given an N-dimensional input [d0, d1, d2, ..., d_{N-2}, d_{N-1}], the + // leading dimensions is d0 * d1 * ... * d_{N-2} + int64_t leading_dims = getLeadingDims(input, input.dim() - 1); + + for (int i = 0; i < leading_dims; ++i) { + for (int j = 0; j < M; ++j) { + float sum = bias_data[j]; + for (int k = 0; k < N; ++k) { + sum += input_data[i * N + k] * weight_data[j * N + k]; + } + output_data[i * M + j] = sum; + } + } +} + +} // namespace native +} // namespace vision +} // namespace impl diff --git a/backends/cadence/vision/operators/quantized_ops.h b/backends/cadence/vision/operators/quantized_ops.h new file mode 100644 index 00000000000..a7251724c53 --- /dev/null +++ b/backends/cadence/vision/operators/quantized_ops.h @@ -0,0 +1,196 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +template +inline __attribute__((always_inline)) void quantized_linear_per_tensor_( + const ::executorch::aten::Tensor& src, + const ::executorch::aten::Tensor& weight, + const ::executorch::aten::Tensor& bias, + const int64_t src_zero_point, + const int64_t weight_zero_point, + const int64_t out_multiplier, + const int64_t out_shift, + const int64_t out_zero_point, + ::executorch::aten::Tensor& out) { + // input comes in shape [leading_dims, in_dim] + // weight comes in shape [out_dim, in_dim] + // output comes in empty with shape [leading_dims, out_dim] + // Perform matrix multiply (M x N) x (N x P)' => M x P + const int64_t leading_dims = + executorch::runtime::getLeadingDims(src, src.dim() - 1); + const int64_t out_dim = weight.size(0); // = out_dim + const int64_t in_dim = weight.size(1); // = in_dim + + const T* __restrict__ in_data = src.const_data_ptr(); + const T* __restrict__ weight_data = weight.const_data_ptr(); + const int32_t* __restrict__ bias_data = bias.const_data_ptr(); + T* __restrict__ out_data = out.mutable_data_ptr(); + + // Compute the requant_scale from out_multiplier and out_shift + const float requant_scale = + -out_multiplier * 1.0 / (1 << 31) * pow(2, out_shift); + + for (size_t i = 0; i < leading_dims; ++i) { + for (size_t j = 0; j < out_dim; ++j) { + int32_t sum = bias_data[j]; + for (size_t k = 0; k < in_dim; ++k) { + int32_t x = (int32_t)in_data[i * in_dim + k] - src_zero_point; + int32_t w = + (int32_t)weight_data[j * in_dim + k] - (int32_t)weight_zero_point; + sum += x * w; + } + out_data[i * out_dim + j] = impl::vision::kernels::quantize( + sum, requant_scale, out_zero_point); + } + } +} + +template +inline __attribute__((always_inline)) void quantized_linear_per_tensor_( + const ::executorch::aten::Tensor& src, + const ::executorch::aten::Tensor& 
weight, + const ::executorch::aten::Tensor& bias, + int64_t src_zero_point, + const ::executorch::aten::Tensor& weight_zero_point_t, + int64_t out_multiplier, + int64_t out_shift, + int64_t out_zero_point, + ::executorch::aten::Tensor& out) { + // Get the zero_point of weight. + int32_t weight_zero_point = weight_zero_point_t.const_data_ptr()[0]; + quantized_linear_per_tensor_( + src, + weight, + bias, + src_zero_point, + weight_zero_point, + out_multiplier, + out_shift, + out_zero_point, + out); +} + +template +inline __attribute__((always_inline)) void quantized_linear_per_channel_( + const ::executorch::aten::Tensor& src, + const ::executorch::aten::Tensor& weight, + const ::executorch::aten::Tensor& bias, + int64_t src_zero_point, + int64_t weight_zero_point, + const ::executorch::aten::Tensor& out_multiplier, + const ::executorch::aten::Tensor& out_shift, + int64_t out_zero_point, + ::executorch::aten::Tensor& out) { + // input comes in shape [leading_dims, in_dim] + // weight comes in shape [out_dim, in_dim] + // output comes in empty with shape [leading_dims, out_dim] + // Perform matrix multiply (M x N) x (N x P)' => M x P + int64_t leading_dims = + executorch::runtime::getLeadingDims(src, src.dim() - 1); + const int64_t out_dim = weight.size(0); // = out_dim + const int64_t in_dim = weight.size(1); // = in_dim + + const T* __restrict__ in_data = src.const_data_ptr(); + const T* __restrict__ weight_data = weight.const_data_ptr(); + const int32_t* __restrict__ bias_data = bias.const_data_ptr(); + T* __restrict__ out_data = out.mutable_data_ptr(); + const int32_t* __restrict__ out_multiplier_data = + out_multiplier.const_data_ptr(); + const int32_t* __restrict__ out_shift_data = + out_shift.const_data_ptr(); + + for (size_t i = 0; i < leading_dims; ++i) { + for (size_t j = 0; j < out_dim; ++j) { + int32_t sum = bias_data[j]; + for (size_t k = 0; k < in_dim; ++k) { + int32_t x = (int32_t)in_data[i * in_dim + k] - src_zero_point; + int32_t w = + 
(int32_t)weight_data[j * in_dim + k] - (int32_t)weight_zero_point; + sum += x * w; + } + // Compute the out_scale from out_multiplier and out_shift + const float out_scale = + -out_multiplier_data[j] * 1.0 / (1 << 31) * pow(2, out_shift_data[j]); + out_data[i * out_dim + j] = + impl::vision::kernels::quantize(sum, out_scale, out_zero_point); + } + } +} + +template +inline __attribute__((always_inline)) void quantized_linear_( + const ::executorch::aten::Tensor& src, + const ::executorch::aten::Tensor& weight, + const ::executorch::aten::Tensor& bias, + int64_t src_zero_point, + int64_t weight_zero_point, + const ::executorch::aten::Tensor& out_multiplier, + const ::executorch::aten::Tensor& out_shift, + int64_t out_zero_point, + ::executorch::aten::Tensor& out) { + if (out_multiplier.numel() == 1) { + // Use per-tensor quantization kernel. + const int32_t* __restrict__ out_multiplier_data = + out_multiplier.const_data_ptr(); + const int32_t* __restrict__ out_shift_data = + out_shift.const_data_ptr(); + quantized_linear_per_tensor_( + src, + weight, + bias, + src_zero_point, + weight_zero_point, + out_multiplier_data[0], + out_shift_data[0], + out_zero_point, + out); + return; + } + + // Use per-channel quantization kernel. + quantized_linear_per_channel_( + src, + weight, + bias, + src_zero_point, + weight_zero_point, + out_multiplier, + out_shift, + out_zero_point, + out); +} + +template +inline __attribute__((always_inline)) void quantized_linear_( + const ::executorch::aten::Tensor& src, + const ::executorch::aten::Tensor& weight, + const ::executorch::aten::Tensor& bias, + int64_t src_zero_point, + const ::executorch::aten::Tensor& weight_zero_point_t, + const ::executorch::aten::Tensor& out_multiplier, + const ::executorch::aten::Tensor& out_shift, + int64_t out_zero_point, + ::executorch::aten::Tensor& out) { + // Get the zero_point of weight. 
+ int32_t weight_zero_point = weight_zero_point_t.const_data_ptr()[0]; + quantized_linear_( + src, + weight, + bias, + src_zero_point, + weight_zero_point, + out_multiplier, + out_shift, + out_zero_point, + out); +} diff --git a/backends/cadence/vision/operators/targets.bzl b/backends/cadence/vision/operators/targets.bzl new file mode 100644 index 00000000000..2dd47e12bd2 --- /dev/null +++ b/backends/cadence/vision/operators/targets.bzl @@ -0,0 +1,83 @@ +load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + + +def define_operator(name: str, deps: list[str] | None = None) -> None: + op_name = "op_{}".format(name) + + # Deps used by all operators. + common_deps = [ + "//executorch/kernels/portable/cpu/util:all_deps", + "//executorch/kernels/portable/cpu/pattern:all_deps", + "//executorch/runtime/kernel:kernel_includes", + "//executorch/kernels/portable/cpu:scalar_utils", + "//executorch/backends/cadence/vision/kernels:cadence_kernels", + "//executorch/kernels/portable/cpu/util:dtype_util", + "//executorch/kernels/portable/cpu/util:elementwise_util", + "//executorch/kernels/portable/cpu/pattern:bitwise_op", + "//executorch/backends/cadence/vision/third-party:vision-nnlib", + "//executorch/kernels/portable/cpu/pattern:comparison_op" + ] + if deps == None: + deps = [] + + # Determine which headers to export based on operator name + exported_headers = ["operators.h"] + + # Add quantized_ops.h header for quantized operators + quantized_ops = [ + "quantized_fully_connected_out", + "quantized_matmul_out", + "quantized_layer_norm", + "quantized_relu_out", + "quantized_conv_out", + "quantized_linear_out", + "quantize_per_tensor", + "dequantize_per_tensor", + "requantize_out" + ] + + if name in quantized_ops: + exported_headers.append("quantized_ops.h") + + runtime.cxx_library( + name = op_name, + srcs = [op_name + ".cpp"], + platforms = CXX, + visibility = [ + 
"//executorch/backends/cadence/...", + "@EXECUTORCH_CLIENTS", + ], + compatible_with = ["ovr_config//cpu:xtensa"], + deps = deps + common_deps, + exported_headers = exported_headers, + ) + +OPERATORS = [ + "add", + "full", + "quantized_fully_connected_out", + "quantized_matmul_out", + "requantize_out", + "dequantize_per_tensor", + "im2row_out", + "quantized_layer_norm", + "quantized_relu_out", + "softmax", + "embedding", + "quantized_conv_out", + "quantized_linear_out", + "quantize_per_tensor", + "view_copy" +] + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + + # Define build targets for all operators registered in the tables above. + for op in OPERATORS: + define_operator(op) diff --git a/backends/cadence/vision/third-party/dummy.c b/backends/cadence/vision/third-party/dummy.c new file mode 100644 index 00000000000..52fb7c18c38 --- /dev/null +++ b/backends/cadence/vision/third-party/dummy.c @@ -0,0 +1,17 @@ +/* Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* Dummy source file for non-Xtensa builds + * This file is used when building the vision-nnlib library on platforms + * other than Xtensa, providing empty stubs for compatibility. + * The actual function implementations are provided as stubs via DISCARD_FUN + * in headers when COMPILER_XTENSA is not defined. + */ + +// This file intentionally contains no function definitions and no includes. +// When COMPILER_XTENSA is not defined, all functions are stubbed out +// using the DISCARD_FUN macro in the header files. 
diff --git a/backends/cadence/vision/third-party/include/api.h b/backends/cadence/vision/third-party/include/api.h new file mode 100644 index 00000000000..efb80c3d76d --- /dev/null +++ b/backends/cadence/vision/third-party/include/api.h @@ -0,0 +1,83 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ +/* These coded instructions, statements, and computer programs ('Cadence */ +/* Libraries') are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ +/* IntegrIT, Ltd. www.integrIT.com, info@integrIT.com */ +/* */ +/* NatureDSP_Baseband Library */ +/* */ +/* This library contains copyrighted materials, trade secrets and other */ +/* proprietary information of IntegrIT, Ltd. This software is licensed for */ +/* use with Cadence processor cores only and must not be used for any other */ +/* processors and platforms. The license to use these sources was given to */ +/* Cadence, Inc. under Terms and Condition of a Software License Agreement */ +/* between Cadence, Inc. and IntegrIT, Ltd. */ +/* ------------------------------------------------------------------------ */ +/* Copyright (C) 2009-2022 IntegrIT, Limited. */ +/* All Rights Reserved. 
*/ +/* ------------------------------------------------------------------------ */ +/* + * API + */ + +#ifndef __API_H__ +#define __API_H__ + +#include "dtypes.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*------------------------------------------------------------------------- +Softmax + +Description: The function computes the softmax (normalized exponential +function) of input data. 16-bit fixed-point functions accept inputs in +Q3.12 and form outputs in Q7.8 format. + +vsoftmax 16-bit +vsoftmax_fp16 IEEE-754 Std. half precision floating-point. +vsoftmaxf IEEE-754 Std. single precision floating-point. + +Accuracy: +2 LSB for fixed point API +2 ULP for floating point API +NOTE: Accuracy of function may depend on amount of data and their +distribution. Given accuracy is achieved for N=2 for any pair of +data from input domain. + + +Parameters: +Input: +x[N] input data, Q3.12 floating point +N Length of input/output data vectors +Output: +y[N] result, Q7.8 or floating point + +Restrictions: +x,y aligned on 2*BBE_SIMD_WIDTH-bytes boundary (vsoftmax) +x,y Must not overlap +N multiple of BBE_SIMD_WIDTH (vsoftmax) +-------------------------------------------------------------------------*/ +void vsoftmaxf(float32_t *y, const float32_t *x, int N); + +void tensor_transposef(float32_t *restrict ptr_out + ,const int *const ptr_out_shape + ,const float32_t *restrict ptr_inp + ,const int *const ptr_inp_shape + ,const int *restrict ptr_permute_vec + ,int num_out_dims + ,int num_inp_dims); + +#ifdef __cplusplus +}; +#endif + +#endif /* __API_H__ */ diff --git a/backends/cadence/vision/third-party/include/dtypes.h b/backends/cadence/vision/third-party/include/dtypes.h new file mode 100644 index 00000000000..c12bbf23ac2 --- /dev/null +++ b/backends/cadence/vision/third-party/include/dtypes.h @@ -0,0 +1,380 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. 
*/ +/* These coded instructions, statements, and computer programs ('Cadence */ +/* Libraries') are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ +/* IntegrIT, Ltd. www.integrIT.com, info@integrIT.com */ +/* */ +/* NatureDSP_Baseband Library */ +/* */ +/* This library contains copyrighted materials, trade secrets and other */ +/* proprietary information of IntegrIT, Ltd. This software is licensed for */ +/* use with Cadence processor cores only and must not be used for any other */ +/* processors and platforms. The license to use these sources was given to */ +/* Cadence, Inc. under Terms and Condition of a Software License Agreement */ +/* between Cadence, Inc. and IntegrIT, Ltd. */ +/* ------------------------------------------------------------------------ */ +/* Copyright (C) 2009-2022 IntegrIT, Limited. */ +/* All Rights Reserved. 
*/ +/* ------------------------------------------------------------------------ */ +/* + * Cross-platform data type definitions and utility macros + */ + +#ifndef __DTYPES_H__ +#define __DTYPES_H__ + +#include + +#ifndef COMPILER_ANSI +/* ---------------------------------------------------------- + Compilers autodetection + ----------------------------------------------------------*/ +#define ___UNKNOWN_COMPILER_YET +#ifdef ___UNKNOWN_COMPILER_YET +#ifdef _MSC_VER + +#ifdef _ARM_ +#define COMPILER_CEARM9E /* Microsoft Visual C++,ARM9E */ +#else +#define COMPILER_MSVC /* Microsoft Visual C++ */ +#endif + +#undef ___UNKNOWN_COMPILER_YET +#endif +#endif + +#ifdef ___UNKNOWN_COMPILER_YET +#ifdef _TMS320C6X +#if defined(_TMS320C6400) +#define COMPILER_C64 +#undef ___UNKNOWN_COMPILER_YET +#endif +#if defined(_TMS320C6400_PLUS) +#define COMPILER_C64PLUS +#undef ___UNKNOWN_COMPILER_YET +#endif +#endif +#endif + +#ifdef ___UNKNOWN_COMPILER_YET +#ifdef __TMS320C55X__ +#define COMPILER_C55 +#undef ___UNKNOWN_COMPILER_YET +#endif +#endif + +#ifdef ___UNKNOWN_COMPILER_YET +#ifdef __ADSPBLACKFIN__ +#define COMPILER_ADSP_BLACKFIN +#undef ___UNKNOWN_COMPILER_YET +#endif +#endif + +#ifdef ___UNKNOWN_COMPILER_YET +#ifdef __XCC__ +#define COMPILER_XTENSA +#undef ___UNKNOWN_COMPILER_YET +#endif +#endif + +#ifdef ___UNKNOWN_COMPILER_YET +#ifdef __GNUC__ +#ifdef __arm__ +#ifndef COMPILER_GNU_ARM +#endif +#define COMPILER_GNUARM /* GNU C/C++ compiler*/ +#else +/* GNU GCC x86 compiler */ +#ifndef COMPILER_GNU +#endif +#define COMPILER_GNU /* GNU C/C++ */ +#endif +#undef ___UNKNOWN_COMPILER_YET +#endif +#endif + +#ifdef ___UNKNOWN_COMPILER_YET +#error Unknown compiler +#endif + +#endif /* #ifndef COMPILER_ANSI */ + +/* ---------------------------------------------------------- + Language-dependent definitions + ----------------------------------------------------------*/ +#ifdef __cplusplus + +#undef extern_C +#define extern_C extern "C" + +#else + +#undef extern_C +#define extern_C + 
+#ifndef false +#define false 0 +#endif +#ifndef true +#define true 1 +#endif + +#endif + +/* Assertion support */ +#if !defined(_ASSERT) +#include <assert.h> +#if defined(_DEBUG) /*&& defined(COMPILER_MSVC)*/ +#define ASSERT(x) \ + { assert(x); } +#else + +/*#undef ASSERT*/ +#ifndef ASSERT +#define ASSERT(_ignore) ((void)0) +#endif + +#endif /* _DEBUG */ +#else /* ASSERT*/ +#define ASSERT(exp) \ + { \ + extern void ExternalAssertHandler(void *, void *, unsigned); \ + (void)((exp) || (ExternalAssertHandler(#exp, __FILE__, __LINE__), 0)); \ + } +#endif /* ASSERT */ + +/*** Inline methods definition ***/ +#undef inline_ +#if (defined COMPILER_MSVC) || (defined COMPILER_CEARM9E) +#define inline_ __inline +#elif defined(COMPILER_ADSP_BLACKFIN) +#define inline_ inline +#elif defined(COMPILER_ANSI) +#define inline_ +#elif (defined COMPILER_GNU) || (defined COMPILER_GNUARM) || \ + (defined COMPILER_ARM) +#define inline_ static inline +#else +#define inline_ static inline +#endif + +#ifndef MAX_INT16 +#define MAX_INT16 ((int16_t)0x7FFF) +#endif +#ifndef MIN_INT16 +#define MIN_INT16 ((int16_t)0x8000) +#endif +#ifndef MAX_INT32 +#define MAX_INT32 ((int32_t)0x7FFFFFFFL) +#endif +#ifndef MIN_INT32 +#define MIN_INT32 ((int32_t)0x80000000L) +#endif +#ifndef MIN_INT64 +#define MIN_INT64 ((int64_t)0x8000000000000000LL) +#endif +#ifndef MAX_INT64 +#define MAX_INT64 ((int64_t)0x7fffffffffffffffLL) +#endif + +/* size of variables in bytes */ +#ifdef COMPILER_C55 +#define SIZEOF_BYTE(x) (sizeof(x) << 1) +#else +#define SIZEOF_BYTE(x) sizeof(x) +#endif + +/*--------------------------------------- + special keywords definition + restrict keyword means that the memory + is addressed exclusively via + this pointer + onchip keyword means that the memory + is on-chip and can not be + accessed via external bus +---------------------------------------*/ +#if defined(COMPILER_C55) +#define NASSERT _nassert +#elif defined(COMPILER_C64) +#define onchip +#define NASSERT _nassert +#elif 
defined(COMPILER_ADSP_BLACKFIN) +#define onchip +#define NASSERT(x) __builtin_assert(x) +#elif defined(COMPILER_GNUARM) +#define onchip +#define NASSERT(x) \ + { (void)__builtin_expect((x) != 0, 1); } +#define restrict __restrict +#elif defined(COMPILER_GNU) +#define onchip +#define NASSERT(x) \ + { \ + (void)__builtin_expect((x) != 0, 1); \ + ASSERT(x); \ + } +#define restrict __restrict +#elif defined(COMPILER_CEARM9E) +#define onchip +#define NASSERT(x) +#define restrict +#elif defined(COMPILER_XTENSA) +#ifndef restrict +#define restrict __restrict +#endif +#define onchip +#define NASSERT(x) \ + { \ + (void)__builtin_expect((x) != 0, 1); \ + ASSERT(x); \ + } +#else +#define restrict +#define onchip +#define NASSERT ASSERT +#endif +#if defined(COMPILER_ADSP_BLACKFIN) +#define NASSERT_ALIGN(addr, align) __builtin_aligned(addr, align) +#else +#define NASSERT_ALIGN(addr, align) NASSERT(((uintptr_t)(addr)) % (align) == 0) +#endif +#define NASSERT_ALIGN2(addr) NASSERT_ALIGN(addr, 2) +#define NASSERT_ALIGN4(addr) NASSERT_ALIGN(addr, 4) +#define NASSERT_ALIGN8(addr) NASSERT_ALIGN(addr, 8) +#define NASSERT_ALIGN16(addr) NASSERT_ALIGN(addr, 16) +#define NASSERT_ALIGN32(addr) NASSERT_ALIGN(addr, 32) +#define NASSERT_ALIGN64(addr) NASSERT_ALIGN(addr, 64) +#define NASSERT_ALIGN128(addr) NASSERT_ALIGN(addr, 128) +/* ---------------------------------------------------------- + Common types + ----------------------------------------------------------*/ +#if defined(COMPILER_GNU) | defined(COMPILER_GNUARM) | defined(COMPILER_XTENSA) +/* + typedef signed char int8_t; + typedef unsigned char uint8_t; +*/ +#include +#elif defined(COMPILER_C64) +#include +#elif defined(COMPILER_C55) +#include +typedef signed char int8_t; +typedef unsigned char uint8_t; +#elif defined(COMPILER_ADSP_BLACKFIN) +typedef signed char int8_t; +typedef unsigned char uint8_t; +typedef unsigned long uint32_t; +typedef unsigned short uint16_t; +typedef long int32_t; +typedef short int16_t; +typedef long long 
int64_t; +typedef unsigned long long uint64_t; +typedef uint32_t uintptr_t; +#else +typedef signed char int8_t; +typedef unsigned char uint8_t; +typedef unsigned long uint32_t; +typedef unsigned short uint16_t; +typedef long int32_t; +typedef short int16_t; +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; +#endif + +#if defined(COMPILER_CEARM9E) +typedef uint32_t uintptr_t; +#endif + +#if defined(COMPILER_ARM) +typedef uint32_t uintptr_t; +#endif + +typedef int16_t float16_t; +typedef float float32_t; +typedef double float64_t; +typedef int16_t fract16; +typedef int32_t fract32; + +typedef union tag_complex_fract16 { + struct { + int16_t re, im; + } s; + uint32_t a; /* just for 32-bit alignment */ +} complex_fract16; + +typedef union tag_complex_fract32 { + struct { + int32_t re, im; + } s; + uint64_t a; /* just for 64-bit alignment */ +} complex_fract32; + +#if defined(COMPILER_MSVC) +#if 0 +/* Note: Visual Studio does not support C99 compatible complex types yet */ +typedef union tag_complex_float { + struct { + float32_t re, im; + } s; + uint64_t a; /* just for 64-bit alignment */ +} complex_float; +typedef union tag_complex_double { + struct { + float64_t re, im; + } s; + uint64_t a[2]; /* only 64-bit alignment under Visual Studio :(( */ +} complex_double; + +inline_ float32_t crealf(complex_float x) { return x.s.re; } +inline_ float32_t cimagf(complex_float x) { return x.s.im; } +inline_ float64_t creal(complex_double x) { return x.s.re; } +inline_ float64_t cimag(complex_double x) { return x.s.im; } +#else +#include <complex.h> +#define complex_float _Fcomplex +#define complex_double _Dcomplex +#endif + +#else +/* C99 compatible type */ +#include <complex.h> +#define complex_float __complex__ float +#define complex_double __complex__ double +#endif + +/* complex half-precision datatype */ +typedef union tag_complex_float16 { + struct { + float16_t re, im; + } s; + uint32_t a; /* just for 32-bit alignment */ +} complex_float16; + +inline_ float16_t crealh(complex_float16 
x) { return x.s.re; } +inline_ float16_t cimagh(complex_float16 x) { return x.s.im; } +/* union data type for writing float32_t/float64_t constants in a bitexact + * form */ +union ufloat32uint32 { + uint32_t u; + float32_t f; +}; +union ufloat64uint64 { + uint64_t u; + float64_t f; +}; +union ufloat16uint16 { + uint16_t u; + float16_t f; +}; + +#if defined(__RENAMING__) +#include "__renaming__.h" +#endif + +#endif /* __DTYPE_H__ */ diff --git a/backends/cadence/vision/third-party/include_private/common.h b/backends/cadence/vision/third-party/include_private/common.h new file mode 100644 index 00000000000..4fc07d8b4d1 --- /dev/null +++ b/backends/cadence/vision/third-party/include_private/common.h @@ -0,0 +1,199 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ +/* These coded instructions, statements, and computer programs ('Cadence */ +/* Libraries') are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ +/* IntegrIT, Ltd. www.integrIT.com, info@integrIT.com */ +/* */ +/* NatureDSP_Baseband Library */ +/* */ +/* This library contains copyrighted materials, trade secrets and other */ +/* proprietary information of IntegrIT, Ltd. This software is licensed for */ +/* use with Cadence processor cores only and must not be used for any other */ +/* processors and platforms. The license to use these sources was given to */ +/* Cadence, Inc. under Terms and Condition of a Software License Agreement */ +/* between Cadence, Inc. 
and IntegrIT, Ltd. */ +/* ------------------------------------------------------------------------ */ +/* Copyright (C) 2009-2022 IntegrIT, Limited. */ +/* All Rights Reserved. */ +/* ------------------------------------------------------------------------ */ + +#ifndef __COMMON_H__ +#define __COMMON_H__ + +#if defined COMPILER_XTENSA +#include +#include +#include +#include +#include +#include +#if XCHAL_HAVE_IDMA +#ifndef IDMA_USE_MULTICHANNEL + #define IDMA_USE_MULTICHANNEL 1 +#endif +#include +#endif +#define IVP_SIMD_WIDTH XCHAL_IVPN_SIMD_WIDTH + +#include "xtensa/config/core-isa.h" +#include "xtensa/tie/xt_ivpn.h" +#if XCHAL_HAVE_IDMA +#include "xtensa/idma.h" +#endif + +#ifdef _MSC_VER +#define ALIGN(x) _declspec(align(x)) +#else +#define ALIGN(x) __attribute__((aligned(x))) +#endif + +#ifdef COMPILER_XTENSA +#define ATTRIBUTE_ALWAYS_INLINE __attribute__((always_inline)) +#define ATTRIBUTE_NEVER_INLINE __attribute__((noinline)) +#define ATTRIBUTE_UNUSED __attribute__((unused)) +#else +#define ATTRIBUTE_ALWAYS_INLINE +#define ATTRIBUTE_NEVER_INLINE +#define ATTRIBUTE_UNUSED +#endif + +/* 'restrict' qualifier, is applied to pointers only under clang compiler */ +#ifdef __clang__ +#define restrict_clang restrict +#else +#define restrict_clang +#endif + +// Performance measurement macros +#define XTPERF_PRINTF(...) printf(__VA_ARGS__) +#define TIME_DECL(test) long start_time_##test, end_time_##test; +#define TIME_START(test) { start_time_##test = 0; XT_WSR_CCOUNT(0); } +#define TIME_END(test) { end_time_##test = XT_RSR_CCOUNT(); } +#define TIME_DISPLAY(test, opcnt, opname) { long long cycles_##test = end_time_##test - start_time_##test; \ + XTPERF_PRINTF("PERF_LOG : %s : %d : %s : %lld : cycles : %.2f : %s/cycle : %.2f : cycles/%s\n", \ + #test, opcnt, opname, cycles_##test, cycles_##test == 0 ? 0 : (double)(opcnt)/cycles_##test, \ + opname, cycles_##test == 0 ? 
0 : 1/((double)(opcnt)/cycles_##test), opname); } + +//----------------------------------------------------- +// log2(BBE_SIMD_WIDTH) +//----------------------------------------------------- +#define LOG2_IVP_SIMD_WIDTH 5 +#define ALIGN_SIMD ALIGN(64) +#define ALIGN_2SIMD ALIGN(128) + +#define LOG2_SIMD_N_2 (LOG2_IVP_SIMD_WIDTH - 1) +#define LOG2_SIMD_2N (LOG2_IVP_SIMD_WIDTH + 1) +//----------------------------------------------------- +// some C++ support +//----------------------------------------------------- + +// special XCC type casting of pointers +#ifdef __cplusplus +#define castxcc(type_, ptr) (ptr) +#else +#define castxcc(type_, ptr) (type_ *)(ptr) +#endif + +//----------------------------------------------------- +// C99 pragma wrapper +//----------------------------------------------------- + +#ifdef COMPILER_XTENSA +#define __Pragma(a) _Pragma(a) +#else +#define __Pragma(a) +#endif + +//----------------------------------------------------- +// Conditionalization support +//----------------------------------------------------- +/* place DISCARD_FUN(retval_type,name) instead of function definition for + functions to be discarded from the executable THIS WORKS only for external + library functions declared as extern "C" and not supported for internal + references without "C" qualifier! 
+*/ +#ifdef COMPILER_MSVC +#pragma section("$DISCARDED_FUNCTIONS", execute, discard) +#pragma section("$$$$$$$$$$", execute, discard) +#define DISCARD_FUN(retval_type, name, arglist) \ + __pragma(alloc_text("$DISCARDED_FUNCTIONS", name)) \ + __pragma(section("$DISCARDED_FUNCTIONS", execute, discard)) \ + __pragma(warning(push)) __pragma(warning(disable : 4026 4716)) \ + retval_type name arglist {} \ + __pragma(warning(pop)) +#endif + +#if defined(COMPILER_XTENSA) || defined(COMPILER_GNU) +#define DISCARD_FUN(retval_type, name, arglist) \ + __asm__(".type " #name ", @object\n\t.global " #name \ + "\n\t.align 4\n\t" #name ":\n\t.long 0x49438B96,0x4D73F192\n\t"); +#endif + +/*------ LIST OF DEFINES DEPENDING ON ISA OPTIONS ------*/ + +/* Single-precision Extended Vector Floating-point option */ +#if ((XCHAL_HAVE_VISION_SP_VFPU)) +#define HAVE_SPX_VFPU 1 +#else +#define HAVE_SPX_VFPU 0 +#endif + +/* all vector single precision/Extended vector floating point instructions */ +#if ((XCHAL_HAVE_VISION_SP_VFPU)) +#define HAVE_SPX_VFPU 1 +#define HAVE_VFPU 1 +#else +#define HAVE_SPX_VFPU 0 +#define HAVE_VFPU 0 +#endif + +/* all scalar single precision floating point instructions */ +#if ((XCHAL_HAVE_VISION_SP_VFPU) || (XCHAL_HAVE_FP)) +#define HAVE_FPU 1 +#else +#define HAVE_FPU 0 +#endif + +#else +#define HAVE_VFPU 0 +#define HAVE_FPU 0 +#endif + +/* detect if half precision FPU is present in a core */ +#if ((XCHAL_HAVE_VISION_HP_VFPU)) +#define HAVE_HPFPU 1 +#include +#else +#define HAVE_HPFPU 0 +#endif + +/* detect if double precision FPU is present in a core */ +#if ((XCHAL_HAVE_VISION_DP_VFPU)) +#define HAVE_DPFPU 1 +#include +#else +#define HAVE_DPFPU 0 +#endif + +/* + 32x32 multiplier +*/ +#if defined(BBE_MULN_2X32) +#define HAVE_32X32 1 +#else +#define HAVE_32X32 0 +#endif + +#ifdef __cplusplus +#define externC extern "C" +#else +#define externC extern +#endif + +#endif // __COMMON_H__ diff --git a/backends/cadence/vision/third-party/include_private/expf_tbl.h 
b/backends/cadence/vision/third-party/include_private/expf_tbl.h new file mode 100644 index 00000000000..702164aba11 --- /dev/null +++ b/backends/cadence/vision/third-party/include_private/expf_tbl.h @@ -0,0 +1,53 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ +/* These coded instructions, statements, and computer programs ('Cadence */ +/* Libraries') are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ +/* IntegrIT, Ltd. www.integrIT.com, info@integrIT.com */ +/* */ +/* NatureDSP_Baseband Library */ +/* */ +/* This library contains copyrighted materials, trade secrets and other */ +/* proprietary information of IntegrIT, Ltd. This software is licensed for */ +/* use with Cadence processor cores only and must not be used for any other */ +/* processors and platforms. The license to use these sources was given to */ +/* Cadence, Inc. under Terms and Condition of a Software License Agreement */ +/* between Cadence, Inc. and IntegrIT, Ltd. */ +/* ------------------------------------------------------------------------ */ +/* Copyright (C) 2009-2022 IntegrIT, Limited. */ +/* All Rights Reserved. */ +/* ------------------------------------------------------------------------ */ + +/* + tables for expf(x) approximation +*/ +#ifndef __EXPF_TBL_H__ +#define __EXPF_TBL_H__ + +/* Portable data types. 
*/ +#include "dtypes.h" +#include "common.h" + +/* + polynomial coefficients for 2^x in range 0...1 + + derived by MATLAB code: + order=6; + x=(0:pow2(1,-16):1); + y=2.^x; + p=polyfit(x,y,6); + p(order+1)=1; + p(order)=p(order)-(sum(p)-2); +*/ +externC const int32_t expftbl_Q30[8]; +externC const union ufloat32uint32 + expfminmax[2]; /* minimum and maximum arguments of expf() input */ +externC const int32_t invln2_Q30; /* 1/ln(2), Q30 */ +externC const union ufloat32uint32 expftblf[7]; +externC const union ufloat32uint32 log2_e[2]; +#endif /* __EXPF_TBL_H__ */ diff --git a/backends/cadence/vision/third-party/include_private/idma_init.h b/backends/cadence/vision/third-party/include_private/idma_init.h new file mode 100644 index 00000000000..841a39cf891 --- /dev/null +++ b/backends/cadence/vision/third-party/include_private/idma_init.h @@ -0,0 +1,36 @@ +#ifndef __IDMA__INIT_H__ +#define __IDMA__INIT_H__ + +#include "../include/dtypes.h" +#include "common.h" + +#define IDMA_BUFF_SIZE \ + 16384 // 16 kb DRAM storage. 
Assume 4 buffers (2 input and 2 output) + +#ifndef PLACE_IN_DRAM0 +#define PLACE_IN_DRAM0 \ + __attribute__((aligned(2 * IVP_SIMD_WIDTH), section(".dram0.data"))) +#endif + +#ifndef PLACE_IN_DRAM1 +#define PLACE_IN_DRAM1 \ + __attribute__((aligned(2 * IVP_SIMD_WIDTH), section(".dram1.data"))) +#endif + +float32_t data_dram0[IDMA_BUFF_SIZE / 2] PLACE_IN_DRAM0; +float32_t data_dram1[IDMA_BUFF_SIZE / 2] PLACE_IN_DRAM1; + +float32_t* inpData[2] = {&data_dram0[0], &data_dram1[0]}; +float32_t* outData[2] = { + &data_dram0[IDMA_BUFF_SIZE / 4], + &data_dram1[IDMA_BUFF_SIZE / 4]}; + +IDMA_BUFFER_DEFINE(buffer_idma_ch0, 1, IDMA_2D_DESC); +IDMA_BUFFER_DEFINE(buffer_idma_ch1, 1, IDMA_2D_DESC); + +idma_buffer_t* descbuf[] = { + buffer_idma_ch0, + buffer_idma_ch1, +}; + +#endif // __IDMA__INIT_H__ diff --git a/backends/cadence/vision/third-party/include_private/inff_tbl.h b/backends/cadence/vision/third-party/include_private/inff_tbl.h new file mode 100644 index 00000000000..1326e92a3c1 --- /dev/null +++ b/backends/cadence/vision/third-party/include_private/inff_tbl.h @@ -0,0 +1,39 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ +/* These coded instructions, statements, and computer programs ('Cadence */ +/* Libraries') are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ +/* IntegrIT, Ltd. 
www.integrIT.com, info@integrIT.com */ +/* */ +/* NatureDSP_Baseband Library */ +/* */ +/* This library contains copyrighted materials, trade secrets and other */ +/* proprietary information of IntegrIT, Ltd. This software is licensed for */ +/* use with Cadence processor cores only and must not be used for any other */ +/* processors and platforms. The license to use these sources was given to */ +/* Cadence, Inc. under Terms and Condition of a Software License Agreement */ +/* between Cadence, Inc. and IntegrIT, Ltd. */ +/* ------------------------------------------------------------------------ */ +/* Copyright (C) 2009-2022 IntegrIT, Limited. */ +/* All Rights Reserved. */ +/* ------------------------------------------------------------------------ */ + +/* + Infinities for single precision routines +*/ +#ifndef __INFF_TBL_H__ +#define __INFF_TBL_H__ + +#include "dtypes.h" +#include "common.h" + +externC const union ufloat32uint32 minusInff; /* -Inf */ +externC const union ufloat32uint32 plusInff; /* +Inf */ +externC const union ufloat32uint32 realmaxf; /* maximum floating point number */ +externC const union ufloat32uint32 realminf; /* minimum floating point number */ +#endif /* __INFF_TBL_H__ */ diff --git a/backends/cadence/vision/third-party/include_private/nanf_tbl.h b/backends/cadence/vision/third-party/include_private/nanf_tbl.h new file mode 100644 index 00000000000..4881b99f070 --- /dev/null +++ b/backends/cadence/vision/third-party/include_private/nanf_tbl.h @@ -0,0 +1,42 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ +/* These coded instructions, statements, and computer programs ('Cadence */ +/* Libraries') are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. 
Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ +/* IntegrIT, Ltd. www.integrIT.com, info@integrIT.com */ +/* */ +/* NatureDSP_Baseband Library */ +/* */ +/* This library contains copyrighted materials, trade secrets and other */ +/* proprietary information of IntegrIT, Ltd. This software is licensed for */ +/* use with Cadence processor cores only and must not be used for any other */ +/* processors and platforms. The license to use these sources was given to */ +/* Cadence, Inc. under Terms and Condition of a Software License Agreement */ +/* between Cadence, Inc. and IntegrIT, Ltd. */ +/* ------------------------------------------------------------------------ */ +/* Copyright (C) 2009-2022 IntegrIT, Limited. */ +/* All Rights Reserved. */ +/* ------------------------------------------------------------------------ */ +/* + NaN values for single precision routines +*/ + +#ifndef __NANF_TBL_H__ +#define __NANF_TBL_H__ + +/* Portable data types. */ +#include "dtypes.h" +/* Common utility macros. */ +#include "common.h" + +extern const union ufloat32uint32 sNaNf; /* Signalling NaN */ +extern const union ufloat32uint32 qNaNf; /* Quiet NaN */ +extern const union ufloat32uint32 minus_sNaNf; /* Negative Signalling NaN */ +extern const union ufloat32uint32 minus_qNaNf; /* Negative Quiet NaN */ + +#endif /* __NANF_TBL_H__ */ diff --git a/backends/cadence/vision/third-party/library/api/tensor_transposef.c b/backends/cadence/vision/third-party/library/api/tensor_transposef.c new file mode 100644 index 00000000000..e6865033740 --- /dev/null +++ b/backends/cadence/vision/third-party/library/api/tensor_transposef.c @@ -0,0 +1,167 @@ +#include "api.h" +#include "common.h" + +/* + * Currently only supports upto 5D input tensors. 
+ * 1/2/3/4 D input tensors will be scaled up to 5D. + * For example, 2x3 -> 1x1x1x2x3. + */ + +void tensor_transposef(float32_t *restrict ptr_out + ,const int *const ptr_out_shape + ,const float32_t *restrict ptr_inp + ,const int *const ptr_inp_shape + ,const int *restrict ptr_permute_vec + ,int num_out_dims + ,int num_inp_dims) +{ + + /* Shift all dim with 1 in the outer part */ + int eff_output_shape[5]; + int eff_permute_vec[5]; + + for (int i = 0; i < num_out_dims; i++){ + eff_output_shape[i] = ptr_out_shape[i]; + eff_permute_vec[i] = ptr_permute_vec[i]; + } + + int one_i = num_out_dims - 1, non_one_i = num_out_dims - 1; + while (one_i > 0 && non_one_i >= 0){ + while (one_i > 0 && eff_output_shape[one_i] != 1){ + one_i--; + } + non_one_i = one_i; + while (non_one_i >= 0 && eff_output_shape[non_one_i]==1){ + non_one_i--; + } + if (one_i > 0 && non_one_i >= 0){ + int temp; + /*swap output_shape*/ + { + temp = eff_output_shape[one_i]; + eff_output_shape[one_i] = eff_output_shape[non_one_i]; + eff_output_shape[non_one_i] = temp; + } + /*swap permute_vec*/ + { + temp = eff_permute_vec[one_i]; + eff_permute_vec[one_i] = eff_permute_vec[non_one_i]; + eff_permute_vec[non_one_i] = temp; + } + } + } + + /* Promoting lesser dim tensors to 5D tensors. + * Also updating the permute_vec and shapes as needed for optimization */ + int ptr_5D_inp_shape[5] = {1, 1, 1, 1, 1}; + int ptr_5D_out_shape[5] = {1, 1, 1, 1, 1}; + int ptr_5D_permute_vec[5] = {0, 1, 2, 3, 4}; + + /* Check if any inner inp dimension is same in the output */ + int last_dim_same = 1, last_n_same_dim = 0; + int itr = num_inp_dims - 1; + while(itr >= 0){ + last_n_same_dim = (last_dim_same && (eff_permute_vec[itr] == itr)) ? (last_n_same_dim + 1) : last_n_same_dim; + last_dim_same = (eff_permute_vec[itr] == itr) ? 
last_dim_same & 1 : last_dim_same & 0; + itr--; + } + + int dims_added = 5 - num_inp_dims; + itr = num_inp_dims - 1; + int same_count = last_n_same_dim; + int count = 4; + while(itr >= 0){ + ptr_5D_inp_shape[count] = (same_count > 0) ? ptr_5D_inp_shape[count] * ptr_inp_shape[itr] : ptr_inp_shape[itr]; + ptr_5D_out_shape[count] = (same_count > 0) ? ptr_5D_out_shape[count] * eff_output_shape[itr] : eff_output_shape[itr]; + same_count--; + itr--; + count = (same_count > 0) ? count : count - 1; + } + + itr = num_inp_dims - 1; + same_count = (last_n_same_dim) ? num_inp_dims - (last_n_same_dim - 1) : 0; + count = 4; + while(itr >= 0){ + ptr_5D_permute_vec[count] = (same_count > 0) ? eff_permute_vec[itr-(last_n_same_dim - 1)] + dims_added + last_n_same_dim - 1 : eff_permute_vec[itr] + dims_added; + same_count--; + itr--; + count--; + } + + int out_dim0, out_dim1, out_dim2, out_dim3, out_dim4; + int inp_dim1, inp_dim2, inp_dim3, inp_dim4; + int inp_stride[5]; + + out_dim0 = ptr_5D_out_shape[0]; + out_dim1 = ptr_5D_out_shape[1]; + out_dim2 = ptr_5D_out_shape[2]; + out_dim3 = ptr_5D_out_shape[3]; + out_dim4 = ptr_5D_out_shape[4]; + + inp_dim1 = ptr_5D_inp_shape[1]; + inp_dim2 = ptr_5D_inp_shape[2]; + inp_dim3 = ptr_5D_inp_shape[3]; + inp_dim4 = ptr_5D_inp_shape[4]; + + inp_stride[0] = inp_dim1 * inp_dim2 * inp_dim3 * inp_dim4; + inp_stride[1] = inp_dim2 * inp_dim3 * inp_dim4; + inp_stride[2] = inp_dim3 * inp_dim4; + inp_stride[3] = inp_dim4; + inp_stride[4] = 1; + + if (last_n_same_dim){ + int itr0, itr1, itr2, itr3, itr4; + float32_t *ptr_inp0 = (float32_t *)ptr_inp; + for (itr0 = 0; itr0 < out_dim0; itr0++){ + float32_t *ptr_inp1 = ptr_inp0 + (itr0 * inp_stride[ptr_5D_permute_vec[0]]); +#pragma loop_count min=1 + for (itr1 = 0; itr1 < out_dim1; itr1++){ + float32_t *ptr_inp2 = ptr_inp1 + (itr1 * inp_stride[ptr_5D_permute_vec[1]]); +#pragma loop_count min=1 + for (itr2 = 0; itr2 < out_dim2; itr2++){ + float32_t *ptr_inp3 = ptr_inp2 + (itr2 * 
inp_stride[ptr_5D_permute_vec[2]]); +#pragma loop_count min=1 + for (itr3 = 0; itr3 < out_dim3; itr3++, ptr_out += out_dim4){ + float32_t *ptr_inp4 = ptr_inp3 + (itr3 * inp_stride[ptr_5D_permute_vec[3]]); + xb_vecN_2xf32 *restrict pae_i = (xb_vecN_2xf32 *)(ptr_inp4); + xb_vecN_2xf32 *restrict pae_o = (xb_vecN_2xf32 *)(ptr_out); + valign a_inp = IVP_LAN_2XF32_PP(pae_i); + valign a_out = IVP_ZALIGN(); + xb_vecN_2xf32 d0; + for(itr4 = 0; itr4 < (out_dim4 >> (LOG2_IVP_SIMD_WIDTH - 1)); itr4++){ + IVP_LAN_2XF32_IP(d0, a_inp, pae_i); + IVP_SAN_2XF32_IP(d0, a_out, pae_o); + } + IVP_SAPOSN_2XF32_FP(a_out, pae_o); + float32_t *restrict puae_i = (float32_t *)(pae_i); + float32_t *restrict puae_o = (float32_t *)(pae_o); +#pragma loop_count max = 17 + for(itr4 = 0; itr4 < (out_dim4 & (IVP_SIMD_WIDTH / 2 - 1)); itr4++){ + puae_o[itr4] = puae_i[itr4]; + } + } + } + } + } + } + else{ + int itr0, itr1, itr2, itr3, itr4; + float32_t *ptr_inp0 = (float32_t *)ptr_inp; + for(itr0 = 0; itr0 < out_dim0; itr0++){ + float32_t *ptr_inp1 = ptr_inp0 + (itr0 * inp_stride[ptr_5D_permute_vec[0]]); + for(itr1 = 0; itr1 < out_dim1; itr1++){ + float32_t *ptr_inp2 = ptr_inp1 + (itr1 * inp_stride[ptr_5D_permute_vec[1]]); + for(itr2 = 0; itr2 < out_dim2; itr2++){ + float32_t *ptr_inp3 = ptr_inp2 + (itr2 * inp_stride[ptr_5D_permute_vec[2]]); + for(itr3 = 0; itr3 < out_dim3; itr3++){ + float32_t *ptr_inp4 = ptr_inp3 + (itr3 * inp_stride[ptr_5D_permute_vec[3]]); + for(itr4 = 0; itr4 < out_dim4; itr4++){ + *ptr_out++ = *ptr_inp4; + ptr_inp4 = ptr_inp4 + inp_stride[ptr_5D_permute_vec[4]]; + } + } + } + } + } + } +} diff --git a/backends/cadence/vision/third-party/library/api/vsoftmaxf.c b/backends/cadence/vision/third-party/library/api/vsoftmaxf.c new file mode 100644 index 00000000000..27487c75d6c --- /dev/null +++ b/backends/cadence/vision/third-party/library/api/vsoftmaxf.c @@ -0,0 +1,241 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2024 by 
Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ +/* These coded instructions, statements, and computer programs ('Cadence */ +/* Libraries') are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ +/* IntegrIT, Ltd. www.integrIT.com, info@integrIT.com */ +/* */ +/* NatureDSP_Baseband Library */ +/* */ +/* This library contains copyrighted materials, trade secrets and other */ +/* proprietary information of IntegrIT, Ltd. This software is licensed for */ +/* use with Cadence processor cores only and must not be used for any other */ +/* processors and platforms. The license to use these sources was given to */ +/* Cadence, Inc. under Terms and Condition of a Software License Agreement */ +/* between Cadence, Inc. and IntegrIT, Ltd. */ +/* ------------------------------------------------------------------------ */ +/* Copyright (C) 2009-2022 IntegrIT, Limited. */ +/* All Rights Reserved. */ +/* ------------------------------------------------------------------------ */ +/* + NatureDSP_Baseband library. Vector Mathematics. + Softmax, floating-point data +*/ +#include "api.h" +#include "common.h" +#include "expf_tbl.h" +#include "inff_tbl.h" +#include "nanf_tbl.h" + +/*------------------------------------------------------------------------- +Softmax + +Description: The function computes the softmax (normalized exponential +function) of input data. 16-bit fixed-point functions accept inputs in +Q3.12 and form outputs in Q7.8 format. + +vsoftmax 16-bit +vsoftmax_fp16 IEEE-754 Std. half precision floating-point. 
+vsoftmaxf IEEE-754 Std. single precision floating-point. + +Accuracy: +2 LSB for fixed point API +2 ULP for floating point API +NOTE: Accuracy of function may depend on amount of data and their +distribution. Given accuracy is achieved for N=2 for any pair of +data from input domain. + + +Parameters: +Input +: +x[N] input data, Q3.12 floating point +N Length of input/output data vectors +Output: +y[N] result, Q7.8 or floating point + +Restrictions: +x,y Must not overlap +-------------------------------------------------------------------------*/ + +#define IVP_ADDSN_2X32(b_, c_) \ + ({ \ + xb_vecN_2x32v a_; \ + xb_vecN_2x64w tmp_a_; \ + tmp_a_ = IVP_MULN_2X32(b_, 1); \ + IVP_MULAN_2X32(tmp_a_, c_, 1); \ + a_ = IVP_PACKVRN_2X64W(tmp_a_, 0); \ + a_; \ + }) + +#if !HAVE_VFPU +DISCARD_FUN(void, vsoftmaxf, (float32_t * y, const float32_t* x, int N)) +#else +void vsoftmaxf(float32_t* y, const float32_t* x, int N) { +#if !defined(IVP_MULN_2X32) +#else + const int* pTbl = (const int*)expftbl_Q30; +#endif + const xb_vecN_2xf32* restrict pX; + xb_vecN_2xf32* restrict pY; + xb_vecN_2xf32 norm, ysum, xmax; + int n; + valign al_X, al_R, al_Y; + if (N < 0) + return; + xmax = minusInff.f; + pX = (const xb_vecN_2xf32*)x; + al_X = IVP_LAN_2XF32_PP(pX); + al_Y = IVP_ZALIGN(); + for (n = 0; n < (N >> (LOG2_IVP_SIMD_WIDTH - 1)); n++) { + xb_vecN_2xf32 x; + IVP_LAN_2XF32_IP(x, al_X, pX); + xmax = IVP_MAXNUMN_2XF32(xmax, x); + } + if (N & (IVP_SIMD_WIDTH / 2 - 1)) { + xb_vecN_2xf32 x; + IVP_LAVN_2XF32_XP( + x, al_X, pX, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); + IVP_MAXNUMN_2XF32T( + xmax, xmax, x, IVP_LTRSN_2((N & (IVP_SIMD_WIDTH / 2 - 1)))); + } + + xmax = IVP_REPN_2XF32(IVP_RMAXNUMN_2XF32(xmax), 0); + __Pragma("no_reorder"); + ysum = 0.f; + pX = (const xb_vecN_2xf32*)x; + pY = (xb_vecN_2xf32*)y; + al_X = IVP_LAN_2XF32_PP(pX); + { + vboolN_2 bnan; + bnan = IVP_LTRN_2I(0); + for (n = 0; n < (N >> (LOG2_IVP_SIMD_WIDTH - 1)); n++) { + xb_vecN_2xf32 x; + IVP_LAN_2XF32_IP(x, 
al_X, pX); + x = IVP_SUBN_2XF32(x, xmax); + bnan |= IVP_UNN_2XF32(x, x); + { + xb_vecN_2xf32 gf, zout; + xb_vecN_2x32v xin_i, fr, exp, t; + xb_vecN_2x32v y, y1, y2, c1, c2, f2; + xb_vecN_2x64w w; + xin_i = IVP_TRUNCN_2XF32(x, 24); + /* Multiply by 1/ln2, extract the integer and fractional (Q32) + * components. */ + /* Q54 <- Q24*Q30 */ + w = IVP_MULN_2X32(xin_i, invln2_Q30); + exp = IVP_PACKVRNRN_2X64W(w, 54); + fr = IVP_SRLN_2X32(IVP_PACKVRNRN_2X64W(w, 22), 1); + /* polynomial for 2^x */ + f2 = IVP_PACKVRN_2X64W(IVP_MULN_2X32(fr, fr), 31); + y1 = IVP_LSRN_2X32_I(pTbl, 0 * sizeof(int32_t)); + y2 = IVP_LSRN_2X32_I(pTbl, 1 * sizeof(int32_t)); + c1 = IVP_LSRN_2X32_I(pTbl, 2 * sizeof(int32_t)); + t = IVP_PACKVRN_2X64W(IVP_MULN_2X32(f2, y1), 31); + y1 = IVP_ADDSN_2X32(c1, t); + c2 = IVP_LSRN_2X32_I(pTbl, 3 * sizeof(int32_t)); + t = IVP_PACKVRN_2X64W(IVP_MULN_2X32(f2, y2), 31); + y2 = IVP_ADDSN_2X32(c2, t); + c1 = IVP_LSRN_2X32_I(pTbl, 4 * sizeof(int32_t)); + t = IVP_PACKVRN_2X64W(IVP_MULN_2X32(f2, y1), 31); + y1 = IVP_ADDSN_2X32(c1, t); + c2 = IVP_LSRN_2X32_I(pTbl, 5 * sizeof(int32_t)); + t = IVP_PACKVRN_2X64W(IVP_MULN_2X32(f2, y2), 31); + y2 = IVP_ADDSN_2X32(c2, t); + c1 = IVP_LSRN_2X32_I(pTbl, 6 * sizeof(int32_t)); + t = IVP_PACKVRN_2X64W(IVP_MULN_2X32(f2, y1), 31); + y1 = IVP_ADDSN_2X32(c1, t); + t = IVP_PACKVRN_2X64W(IVP_MULN_2X32(fr, y2), 31); + y = IVP_ADDSN_2X32(y1, t); + /* scale result to original exponent ignoring very low items */ + gf = IVP_FLOATN_2X32(y, 30); + exp = IVP_SLLIN_2X32(IVP_MAXN_2X32(IVP_ADDN_2X32(127, exp), 0), 23); + zout = IVP_MULN_2XF32(gf, IVP_MOVN_2XF32_FROMN_2X32(exp)); + x = zout; + } + ysum = IVP_ADDN_2XF32(ysum, x); + IVP_SAN_2XF32_IP(x, al_Y, pY); + } + if (N & (IVP_SIMD_WIDTH / 2 - 1)) { + xb_vecN_2xf32 x; + IVP_LAVN_2XF32_XP( + x, al_X, pX, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); + x = IVP_SUBN_2XF32(x, xmax); + bnan |= IVP_UNN_2XF32(x, x); + { + xb_vecN_2xf32 gf, zout; + xb_vecN_2x32v xin_i, fr, exp, t; + xb_vecN_2x32v 
y, y1, y2, c1, c2, f2; + xb_vecN_2x64w w; + xin_i = IVP_TRUNCN_2XF32(x, 24); + /* Multiply by 1/ln2, extract the integer and fractional (Q32) + * components. */ + /* Q54 <- Q24*Q30 */ + w = IVP_MULN_2X32(xin_i, invln2_Q30); + exp = IVP_PACKVRNRN_2X64W(w, 54); + fr = IVP_SRLN_2X32(IVP_PACKVRNRN_2X64W(w, 22), 1); + /* polynomial for 2^x */ + f2 = IVP_PACKVRN_2X64W(IVP_MULN_2X32(fr, fr), 31); + y1 = IVP_LSRN_2X32_I(pTbl, 0 * sizeof(int32_t)); + y2 = IVP_LSRN_2X32_I(pTbl, 1 * sizeof(int32_t)); + c1 = IVP_LSRN_2X32_I(pTbl, 2 * sizeof(int32_t)); + t = IVP_PACKVRN_2X64W(IVP_MULN_2X32(f2, y1), 31); + y1 = IVP_ADDSN_2X32(c1, t); + c2 = IVP_LSRN_2X32_I(pTbl, 3 * sizeof(int32_t)); + t = IVP_PACKVRN_2X64W(IVP_MULN_2X32(f2, y2), 31); + y2 = IVP_ADDSN_2X32(c2, t); + c1 = IVP_LSRN_2X32_I(pTbl, 4 * sizeof(int32_t)); + t = IVP_PACKVRN_2X64W(IVP_MULN_2X32(f2, y1), 31); + y1 = IVP_ADDSN_2X32(c1, t); + c2 = IVP_LSRN_2X32_I(pTbl, 5 * sizeof(int32_t)); + t = IVP_PACKVRN_2X64W(IVP_MULN_2X32(f2, y2), 31); + y2 = IVP_ADDSN_2X32(c2, t); + c1 = IVP_LSRN_2X32_I(pTbl, 6 * sizeof(int32_t)); + t = IVP_PACKVRN_2X64W(IVP_MULN_2X32(f2, y1), 31); + y1 = IVP_ADDSN_2X32(c1, t); + t = IVP_PACKVRN_2X64W(IVP_MULN_2X32(fr, y2), 31); + y = IVP_ADDSN_2X32(y1, t); + /* scale result to original exponent ignoring very low items */ + gf = IVP_FLOATN_2X32(y, 30); + exp = IVP_SLLIN_2X32(IVP_MAXN_2X32(IVP_ADDN_2X32(127, exp), 0), 23); + zout = IVP_MULN_2XF32(gf, IVP_MOVN_2XF32_FROMN_2X32(exp)); + x = zout; + } + IVP_ADDN_2XF32T( + ysum, ysum, x, IVP_LTRSN_2((N & (IVP_SIMD_WIDTH / 2 - 1)))); + IVP_SAVN_2XF32_XP( + x, al_Y, pY, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); + } + IVP_SAPOSN_2XF32_FP(al_Y, pY); + ysum = IVP_MOVN_2XF32T(qNaNf.f, ysum, bnan); + } + norm = XT_RECIP_S(IVP_RADDN_2XF32(ysum)); + __Pragma("no_reorder"); + pX = (const xb_vecN_2xf32*)y; + pY = (xb_vecN_2xf32*)y; + + al_R = IVP_LAN_2XF32_PP(pX); + + for (n = 0; n < (N >> (LOG2_IVP_SIMD_WIDTH - 1)); n++) { + xb_vecN_2xf32 x; + 
IVP_LAN_2XF32_IP(x, al_R, pX); + x = IVP_MULN_2XF32(x, norm); + IVP_SAN_2XF32_IP(x, al_Y, pY); + } + if (N & (IVP_SIMD_WIDTH / 2 - 1)) { + xb_vecN_2xf32 x; + IVP_LAVN_2XF32_XP( + x, al_R, pX, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); + x = IVP_MULN_2XF32(x, norm); + IVP_SAVN_2XF32_XP( + x, al_Y, pY, sizeof(float32_t) * (N & (IVP_SIMD_WIDTH / 2 - 1))); + } + IVP_SAPOSN_2XF32_FP(al_Y, pY); + +} /* vsoftmaxf() */ +#endif diff --git a/backends/cadence/vision/third-party/library/tables/expf_tbl.c b/backends/cadence/vision/third-party/library/tables/expf_tbl.c new file mode 100644 index 00000000000..f1c6f3d44ae --- /dev/null +++ b/backends/cadence/vision/third-party/library/tables/expf_tbl.c @@ -0,0 +1,85 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ +/* These coded instructions, statements, and computer programs ('Cadence */ +/* Libraries') are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ +/* IntegrIT, Ltd. www.integrIT.com, info@integrIT.com */ +/* */ +/* NatureDSP_Baseband Library */ +/* */ +/* This library contains copyrighted materials, trade secrets and other */ +/* proprietary information of IntegrIT, Ltd. This software is licensed for */ +/* use with Cadence processor cores only and must not be used for any other */ +/* processors and platforms. The license to use these sources was given to */ +/* Cadence, Inc. under Terms and Condition of a Software License Agreement */ +/* between Cadence, Inc. 
and IntegrIT, Ltd. */ +/* ------------------------------------------------------------------------ */ +/* Copyright (C) 2009-2022 IntegrIT, Limited. */ +/* All Rights Reserved. */ +/* ------------------------------------------------------------------------ */ + +/* + tables for expf(x) approximation +*/ +/* Portable data types. */ +#include "expf_tbl.h" +#include "dtypes.h" + +/* + polynomial coefficients for 2^x in range 0...1 + + derived by MATLAB code: + order=6; + x=(0:pow2(1,-16):1); + y=2.^x; + p=polyfit(x,y,6); + p(order+1)=1; + p(order)=p(order)-(sum(p)-2); +*/ +const int32_t ALIGN_2SIMD expftbl_Q30[8] = { + 234841, + 1329551, + 10400465, + 59570027, + 257946177, + 744260763, + 1073741824, + 0 /* Padding to allow for vector loads */ +}; + +const union ufloat32uint32 ALIGN_2SIMD + expfminmax[2] = /* minimum and maximum arguments of expf() input */ + { + {0xc2ce8ed0}, /*-1.0327893066e+002f */ + {0x42b17218} /* 8.8722839355e+001f */ +}; + +const int32_t invln2_Q30 = 1549082005L; /* 1/ln(2), Q30 */ + +const union ufloat32uint32 ALIGN_2SIMD log2_e[2] = { + {0x3fb8aa3b}, /* 1.4426950216 */ + {0x32a57060} /* 1.9259629891e-008 */ +}; + +/* +order=6; +x=(0:pow2(1,-16):1); +y=2.^x; +p=polyfit(x,y,order); +p(order+1)=1; +p(order)=p(order)-(sum(p)-2); +num2hex(single(p)); +*/ +const union ufloat32uint32 ALIGN_2SIMD expftblf[] = { + {0x39655635}, + {0x3aa24c7a}, + {0x3c1eb2d1}, + {0x3d633ddb}, + {0x3e75ff24}, + {0x3f317212}, + {0x3f800000}}; diff --git a/backends/cadence/vision/third-party/library/tables/inff_tbl.c b/backends/cadence/vision/third-party/library/tables/inff_tbl.c new file mode 100644 index 00000000000..8464ee9f549 --- /dev/null +++ b/backends/cadence/vision/third-party/library/tables/inff_tbl.c @@ -0,0 +1,38 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. 
*/ +/* These coded instructions, statements, and computer programs ('Cadence */ +/* Libraries') are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ +/* IntegrIT, Ltd. www.integrIT.com, info@integrIT.com */ +/* */ +/* NatureDSP_Baseband Library */ +/* */ +/* This library contains copyrighted materials, trade secrets and other */ +/* proprietary information of IntegrIT, Ltd. This software is licensed for */ +/* use with Cadence processor cores only and must not be used for any other */ +/* processors and platforms. The license to use these sources was given to */ +/* Cadence, Inc. under Terms and Condition of a Software License Agreement */ +/* between Cadence, Inc. and IntegrIT, Ltd. */ +/* ------------------------------------------------------------------------ */ +/* Copyright (C) 2009-2022 IntegrIT, Limited. */ +/* All Rights Reserved. 
*/ +/* ------------------------------------------------------------------------ */ + +/* + infinities for single precision routines +*/ + +#include "inff_tbl.h" +#include "dtypes.h" + +const union ufloat32uint32 minusInff = {0xff800000}; /* -Inf */ +const union ufloat32uint32 plusInff = {0x7f800000}; /* +Inf */ +const union ufloat32uint32 realmaxf = { + 0x7f7fffff}; /* maximum floating point number */ +const union ufloat32uint32 realminf = { + 0x00800000}; /* minimum floating point number */ diff --git a/backends/cadence/vision/third-party/library/tables/nanf_tbl.c b/backends/cadence/vision/third-party/library/tables/nanf_tbl.c new file mode 100644 index 00000000000..f165234fce4 --- /dev/null +++ b/backends/cadence/vision/third-party/library/tables/nanf_tbl.c @@ -0,0 +1,38 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2024 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ +/* These coded instructions, statements, and computer programs ('Cadence */ +/* Libraries') are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ +/* IntegrIT, Ltd. www.integrIT.com, info@integrIT.com */ +/* */ +/* NatureDSP_Baseband Library */ +/* */ +/* This library contains copyrighted materials, trade secrets and other */ +/* proprietary information of IntegrIT, Ltd. This software is licensed for */ +/* use with Cadence processor cores only and must not be used for any other */ +/* processors and platforms. The license to use these sources was given to */ +/* Cadence, Inc. 
under Terms and Condition of a Software License Agreement */ +/* between Cadence, Inc. and IntegrIT, Ltd. */ +/* ------------------------------------------------------------------------ */ +/* Copyright (C) 2009-2022 IntegrIT, Limited. */ +/* All Rights Reserved. */ +/* ------------------------------------------------------------------------ */ +/* + NaN values for single precision routines +*/ + +/* Portable data types. */ +/* NaN values for single precision routines. */ +#include "nanf_tbl.h" +#include "dtypes.h" + +const union ufloat32uint32 sNaNf = {0x7f800001}; /* Signalling NaN */ +const union ufloat32uint32 qNaNf = {0x7fc00000}; /* Quiet NaN */ +const union ufloat32uint32 minus_sNaNf = { + 0xff800001}; /* Negative Signalling NaN */ +const union ufloat32uint32 minus_qNaNf = {0xffc00000}; /* Negative Quiet NaN */ diff --git a/backends/cadence/vision/third-party/targets.bzl b/backends/cadence/vision/third-party/targets.bzl new file mode 100644 index 00000000000..26a097010d5 --- /dev/null +++ b/backends/cadence/vision/third-party/targets.bzl @@ -0,0 +1,38 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+ +load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//arvr/tools/build_defs:oxx.bzl", "oxx_binary", "oxx_static_library") + + +def define_common_targets(): + runtime.cxx_library( + name = "vision-nnlib", + srcs = select({ + "DEFAULT": ["dummy.c"], # Use dummy file for non-Xtensa builds + "ovr_config//cpu:xtensa": glob(["library/**/*.c"]), + }), + exported_headers = glob([ + "include/*.h", + "include_private/*.h" + ]), + header_namespace = "", + visibility = [ + "//executorch/backends/cadence/...", + "@EXECUTORCH_CLIENTS", + ], + platforms = CXX, + compatible_with = select({ + "DEFAULT": [], + "ovr_config//cpu:xtensa": ["ovr_config//cpu:xtensa"], + }), + compiler_flags = select({ + "DEFAULT": ["-UCOMPILER_XTENSA"], # Ensure COMPILER_XTENSA is not defined for non-Xtensa builds + "ovr_config//cpu:xtensa": [ + "-DCOMPILER_XTENSA", + "-Ixplat/executorch/backends/cadence/vision/third-party/include", + "-Ixplat/executorch/backends/cadence/vision/third-party/include_private", + ], + }), + define_static_target = True, + ) diff --git a/backends/cortex_m/CMakeLists.txt b/backends/cortex_m/CMakeLists.txt index 1567b8b5e1c..a728584e49c 100644 --- a/backends/cortex_m/CMakeLists.txt +++ b/backends/cortex_m/CMakeLists.txt @@ -12,7 +12,7 @@ if(NOT CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17) endif() -# Source root directory for executorch. +# Source root directory for executorch if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) 
endif() @@ -21,71 +21,76 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake) include(FetchContent) -# CMSIS-NN version to download +# CMSIS-NN configuration with dynamic path detection set(CMSIS_NN_VERSION - "v4.1.0" + "v7.0.0" CACHE STRING "CMSIS-NN version to download" ) - -# Declare CMSIS-NN as a FetchContent project -FetchContent_Declare( - cmsis_nn - GIT_REPOSITORY https://github.com/ARM-software/CMSIS-NN.git - GIT_TAG ${CMSIS_NN_VERSION} +set(CMSIS_NN_LOCAL_PATH + "" + CACHE PATH "Path to existing local CMSIS-NN installation" ) -# Download and make CMSIS-NN available -FetchContent_MakeAvailable(cmsis_nn) - -# Print paths for debugging -message(STATUS "CMSIS-NN source dir: ${cmsis_nn_SOURCE_DIR}") -message(STATUS "CMSIS-NN binary dir: ${cmsis_nn_BINARY_DIR}") +# Try to find existing / local CMSIS-NN installation. This is useful for +# debugging and testing with local changes. This is not common, as the CMSIS-NN +# library is downloaded via FetchContent in the default/regular case. 
+if(CMSIS_NN_LOCAL_PATH AND EXISTS "${CMSIS_NN_LOCAL_PATH}") + message(STATUS "Using CMSIS-NN from specified path: ${CMSIS_NN_LOCAL_PATH}") + add_subdirectory(${CMSIS_NN_LOCAL_PATH} _deps/cmsis_nn-build) +else() + # Use FetchContent with automatic fallback + message(STATUS "Using CMSIS-NN via FetchContent") + + FetchContent_Declare( + cmsis_nn + GIT_REPOSITORY https://github.com/ARM-software/CMSIS-NN.git + GIT_TAG ${CMSIS_NN_VERSION} + GIT_SHALLOW TRUE + ) + + FetchContent_MakeAvailable(cmsis_nn) +endif() # Cortex-M ops kernel sources set(_cortex_m_kernels__srcs ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantize_per_tensor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_dequantize_per_tensor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_add.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_linear.cpp ) -# Generate C++ bindings to register kernels into Executorch (for runtime) +# Generate C++ bindings to register kernels into Executorch set(_yaml_file ${CMAKE_CURRENT_LIST_DIR}/ops/operators.yaml) gen_selected_ops(LIB_NAME "cortex_m_ops_lib" OPS_SCHEMA_YAML "${_yaml_file}") - generate_bindings_for_kernels( LIB_NAME "cortex_m_ops_lib" CUSTOM_OPS_YAML "${_yaml_file}" ) -message("Generated files ${gen_command_sources}") -# Build a library for cortex_m_kernels +# Build library for cortex_m_kernels add_library(cortex_m_kernels ${_cortex_m_kernels__srcs}) -target_compile_options(cortex_m_kernels PUBLIC ${_common_compile_options}) -# Include directories for cortex_m_kernels -target_include_directories( +# Use PRIVATE for implementation dependencies to avoid INTERFACE pollution +target_link_libraries( cortex_m_kernels - PRIVATE ${EXECUTORCH_ROOT}/.. 
- ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10 - ${cmsis_nn_SOURCE_DIR}/Include + PRIVATE cmsis-nn + PRIVATE executorch ) -# Link directly to the CMSIS-NN static library file -target_link_libraries( - cortex_m_kernels PUBLIC ${cmsis_nn_BINARY_DIR}/libcmsis-nn.a executorch +# Include directories for cortex_m_kernels +target_include_directories( + cortex_m_kernels PRIVATE ${EXECUTORCH_ROOT}/.. + ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10 ) -# Add dependency to ensure CMSIS-NN builds before we try to link. Use the actual -# CMSIS-NN target name (usually 'cmsis-nn') -add_dependencies(cortex_m_kernels cmsis-nn) - # cortex_m_ops_lib: Register Cortex-M ops kernels into Executorch runtime gen_operators_lib( LIB_NAME "cortex_m_ops_lib" KERNEL_LIBS cortex_m_kernels DEPS executorch ) install( - TARGETS cortex_m_kernels cortex_m_ops_lib + TARGETS cortex_m_kernels cortex_m_ops_lib cmsis-nn EXPORT ExecuTorchTargets - DESTINATION lib - PUBLIC_HEADER DESTINATION include/executorch/backends/cortex_m/ops/ + DESTINATION ${CMAKE_INSTALL_LIBDIR} + PUBLIC_HEADER + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/executorch/backends/cortex_m/ops/ ) diff --git a/backends/cortex_m/ops/cmsis_scratch_buffer_context.h b/backends/cortex_m/ops/cmsis_scratch_buffer_context.h new file mode 100644 index 00000000000..4b9fdaebdf7 --- /dev/null +++ b/backends/cortex_m/ops/cmsis_scratch_buffer_context.h @@ -0,0 +1,187 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#pragma once + +#include "cortex_m_ops_common.h" +extern "C" { +#include "arm_nnfunctions.h" +} + +namespace cortex_m { +namespace native { + +// During AOT phase, quantized_linear_fusion_pass allocates total buffer +// and passes in as 'Tensor'. 
(Total buffer = 8-byte header + x bytes) +// ┌─────────────────┬─────────────────────────────────────┐ +// │ KernelSum Header│ CMSIS Workspace │ +// │ (8 bytes) │ (x bytes) │ +// └─────────────────┴─────────────────────────────────────┘ +// │ │ +// │ └─> Passed to CMSIS API +// │ +// └─> State for kernel sum + +// C++ Runtime: +// ┌─────────────────┬─────────────────────────────────────┐ +// │ KernelSum Header│ CMSIS Workspace │ +// │ (8 bytes) │ (x bytes) │ +// └─────────────────┴─────────────────────────────────────┘ +// ^ ^ +// │ │ +// scratch_ptr cmsis_workspace_ptr +// │ │ +// ▼ ▼ +// arm_vector_sum_s8() writes kernel sums (with bias if avail): +// [sum₀+bias₀][sum₁+bias₁][sum₂+bias₂]...[sum_{n-1}+bias_{n-1}] +// (n * 4-byte int32_t values = x bytes) +// +// - n = out_features (number of output features) +// - x = n * 4 bytes (total CMSIS buffer size) +// - Total buffer = 8 + x bytes + +class CMSISScratchBufferContext final { + public: + CMSISScratchBufferContext( + Tensor& scratch_buffer, + const Tensor& weights, + const Tensor& weight_zero_point, + const torch::executor::optional& bias) + : scratch_ptr_(scratch_buffer.mutable_data_ptr()), + total_size_(scratch_buffer.size(0)), + base_ptr_(reinterpret_cast(scratch_ptr_)), + in_features_(weights.size(1)), + out_features_(weights.size(0)), + is_per_channel_(weight_zero_point.numel() > 1), + weight_data_offset_(calculate_offset(weights.const_data_ptr())), + weight_zp_data_offset_( + calculate_offset(weight_zero_point.const_data_ptr())), + bias_data_offset_( + bias.has_value() + ? 
calculate_offset(bias.value().const_data_ptr()) + : 0), + header_(reinterpret_cast(scratch_ptr_)), + cmsis_workspace_ptr_(scratch_ptr_ + KERNEL_SUM_HEADER_SIZE) { + cmsis_nn_dims filter_dims = {in_features_, 1, 1, out_features_}; + validate_size(filter_dims); + } + + cmsis_nn_context get_cmsis_ctx() const { + cmsis_nn_context ctx; + ET_CHECK_MSG( + reinterpret_cast(cmsis_workspace_ptr_) % 4 == 0, + "CMSIS workspace not 4-byte aligned"); + ctx.buf = cmsis_workspace_ptr_; + ctx.size = get_cmsis_workspace_size(); + return ctx; + } + + bool is_kernel_sum_updated() const { + return header_->updated; + } + + void compute_kernel_sums_if_needed() { + if (!header_->updated) { + arm_vector_sum_s8( + reinterpret_cast(cmsis_workspace_ptr_), + in_features_, + out_features_, + get_weight_data(), + get_weight_zp_data()[0], + 0, + get_bias_data()); + header_->updated = true; + ET_LOG( + Info, + "Computed kernel sums. [required_bytes : %d]", + header_->required_size); + } + } + + const int8_t* get_weight_data() const { + return reinterpret_cast(base_ptr_ + weight_data_offset_); + } + + const int32_t* get_weight_zp_data() const { + return reinterpret_cast(base_ptr_ + weight_zp_data_offset_); + } + + const int32_t* get_bias_data() const { + return bias_data_offset_ == 0 + ? 
nullptr + : reinterpret_cast(base_ptr_ + bias_data_offset_); + } + + bool is_per_channel_quant() const { + return is_per_channel_; + } + int32_t get_in_features() const { + return in_features_; + } + int32_t get_out_features() const { + return out_features_; + } + + private: + static constexpr size_t KERNEL_SUM_HEADER_SIZE = 8; + + // Header for kernel sum computation state only + struct KernelSumHeader { + bool updated = false; + int32_t required_size = 0; + }; + static_assert( + sizeof(KernelSumHeader) == KERNEL_SUM_HEADER_SIZE, + "KernelSumHeader must be exactly 8 bytes"); + + int8_t* scratch_ptr_; + size_t total_size_; + uint8_t* base_ptr_; + + // Context members + const int32_t in_features_; + const int32_t out_features_; + const bool is_per_channel_; + const uint32_t weight_data_offset_; + const uint32_t weight_zp_data_offset_; + const uint32_t bias_data_offset_; + + KernelSumHeader* header_; + int8_t* cmsis_workspace_ptr_; + + uint32_t calculate_offset(const void* ptr) const { + if (ptr == nullptr) + return 0; + + const uint8_t* ptr_bytes = reinterpret_cast(ptr); + ET_CHECK_MSG(ptr_bytes >= base_ptr_, "Pointer is before base address"); + + const std::ptrdiff_t offset = ptr_bytes - base_ptr_; + ET_CHECK_MSG( + offset >= 0 && offset <= UINT32_MAX, "Offset out of valid range"); + return static_cast(offset); + } + + size_t get_cmsis_workspace_size() const { + return total_size_ - KERNEL_SUM_HEADER_SIZE; + } + + void validate_size(const cmsis_nn_dims& filter_dims) const { + header_->required_size = + arm_fully_connected_s8_get_buffer_size(&filter_dims); + + ET_CHECK_MSG( + get_cmsis_workspace_size() >= + static_cast(header_->required_size), + "Scratch buffer size %zu insufficient for required size %d", + get_cmsis_workspace_size(), + header_->required_size); + } +}; + +} // namespace native +} // namespace cortex_m diff --git a/backends/cortex_m/ops/cortex_m_ops_common.h b/backends/cortex_m/ops/cortex_m_ops_common.h index 5ef2d9d4bf9..eaa7027e46c 100644 --- 
a/backends/cortex_m/ops/cortex_m_ops_common.h +++ b/backends/cortex_m/ops/cortex_m_ops_common.h @@ -22,6 +22,10 @@ using ScalarType = executorch::aten::ScalarType; using Scalar = torch::executor::Scalar; using Error = executorch::runtime::Error; +// From arm_nn_math_types.h +#define ARM_NN_Q31_MAX ((int32_t)(0x7FFFFFFFL)) +#define ARM_NN_Q31_MIN ((int32_t)(0x80000000L)) + // Basic tensor type / layout validation and dimension order checking inline void validate_cmsis_nn_tensor_requirements( const Tensor& input1, @@ -32,16 +36,19 @@ inline void validate_cmsis_nn_tensor_requirements( // Basic dtype validation ET_CHECK_MSG( input1.scalar_type() == expected_dtype, - "Input1 dtype must be %hhd", - expected_dtype); + "Input1 dtype must be %hhd, got %hhd", + expected_dtype, + input1.scalar_type()); ET_CHECK_MSG( input2.scalar_type() == expected_dtype, - "Input2 dtype must be %hhd", - expected_dtype); + "Input2 dtype must be %hhd, got %hhd", + expected_dtype, + input2.scalar_type()); ET_CHECK_MSG( output.scalar_type() == expected_dtype, - "Output dtype must be %hhd", - expected_dtype); + "Output dtype must be %hhd, got %hhd", + expected_dtype, + output.scalar_type()); // Dim order consistency ET_CHECK_MSG( @@ -114,6 +121,33 @@ inline void validate_quantization_params( "Single quant Output"); } +// Refer to CMSIS-NN 'arm_nn_requantize' implementation for details: +// https://github.com/ARM-software/CMSIS-NN/blob/main/Include/arm_nnsupportfunctions.h#L1625 +// multiplier: Range {ARM_NN_Q31_MIN + 1, Q32_MAX} +// shift : Range {-31, 30} +inline bool validate_per_channel_quant_params( + const int32_t* multipliers, + const int32_t* shifts, + int num_channels) { + for (int i = 0; i < num_channels; ++i) { + // Multiplier: {ARM_NN_Q31_MIN + 1, ARM_NN_Q31_MAX} + if (multipliers[i] <= ARM_NN_Q31_MIN || multipliers[i] > ARM_NN_Q31_MAX) { + ET_LOG( + Error, + "weight_multiplier[%d] out of CMSIS-NN range: %d", + i, + multipliers[i]); + return false; + } + // Shift: {-31, 30} for 
arm_nn_requantize + if (shifts[i] < -31 || shifts[i] > 30) { + ET_LOG(Error, "weight_shift[%d] out of range: %d", i, shifts[i]); + return false; + } + } + return true; +} + inline Error resize_to_broadcast_target_size( const Tensor& input1, const Tensor& input2, diff --git a/backends/cortex_m/ops/op_quantized_linear.cpp b/backends/cortex_m/ops/op_quantized_linear.cpp new file mode 100644 index 00000000000..d1ccb6d0d45 --- /dev/null +++ b/backends/cortex_m/ops/op_quantized_linear.cpp @@ -0,0 +1,171 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "cmsis_scratch_buffer_context.h" +#include "cortex_m_ops_common.h" + +extern "C" { +#include "arm_nnfunctions.h" +} + +namespace cortex_m { +namespace native { +using KernelRuntimeContext = torch::executor::KernelRuntimeContext; + +Tensor& quantized_linear_out( + KernelRuntimeContext& context, + const Tensor& input, + const Scalar& input_zero_point, + const Scalar& input_multiplier, + const Scalar& input_shift, + const Tensor& weights, + const Tensor& weight_zero_point, + const Tensor& weight_multiplier, + const Tensor& weight_shift, + const torch::executor::optional& bias, + const Tensor& bias_multiplier, + const Tensor& bias_shift, + const Tensor& scratch_buffer, + const Scalar& output_zero_point, + const Scalar& in_features, + const Scalar& out_features, + Tensor& out) { + ET_LOG(Info, "quantized_linear_out: called"); + validate_cmsis_nn_tensor_requirements(input, weights, out); + + ET_CHECK_MSG( + scratch_buffer.scalar_type() == ScalarType::Char, + "Scratch buffer must be int8"); + + const int32_t batch_size = input.size(0); + const int32_t in_feat = static_cast(in_features.to()); + const int32_t out_feat = static_cast(out_features.to()); + const int32_t input_zp = static_cast(input_zero_point.to()); + const int32_t output_zp = + 
static_cast(output_zero_point.to()); + const bool is_per_channel = (weight_zero_point.numel() > 1); + + const int8_t* input_data = input.const_data_ptr(); + const int8_t* weight_data = weights.const_data_ptr(); + const int32_t* bias_data = + bias.has_value() ? bias.value().const_data_ptr() : nullptr; + int8_t* output_data = out.mutable_data_ptr(); + const int32_t* weight_zp_data = weight_zero_point.const_data_ptr(); + const int32_t* weight_mult_data = weight_multiplier.const_data_ptr(); + const int32_t* weight_shift_data = weight_shift.const_data_ptr(); + + if (!validate_per_channel_quant_params( + weight_mult_data, weight_shift_data, out_feat)) { + context.fail(Error::InvalidArgument); + return out; + } + + // Initialize scratch buffer context (validates early) + CMSISScratchBufferContext scratch_ctx( + const_cast(scratch_buffer), weights, weight_zero_point, bias); + + scratch_ctx.compute_kernel_sums_if_needed(); + cmsis_nn_context ctx = scratch_ctx.get_cmsis_ctx(); + + // Setup CMSIS-NN parameters + cmsis_nn_fc_params fc_params; + fc_params.input_offset = -input_zp; + fc_params.output_offset = output_zp; + fc_params.activation.min = std::numeric_limits::min(); + fc_params.activation.max = std::numeric_limits::max(); + + cmsis_nn_dims input_dims = {1, 1, 1, in_feat}; + cmsis_nn_dims filter_dims = {in_feat, 1, 1, out_feat}; + cmsis_nn_dims bias_dims = {1, 1, 1, out_feat}; + cmsis_nn_dims output_dims = {1, 1, 1, out_feat}; + + arm_cmsis_nn_status status; + for (int32_t b = 0; b < batch_size; b++) { + const int8_t* batch_input = input_data + b * in_feat; + int8_t* batch_output = output_data + b * out_feat; + + ET_CHECK_MSG( + batch_input != nullptr && weight_data != nullptr, + "Null input pointers"); + ET_CHECK_MSG(in_feat > 0 && out_feat > 0, "Invalid dimensions"); + + if (is_per_channel) { + cmsis_nn_per_channel_quant_params per_channel_quant_params; + per_channel_quant_params.multiplier = + const_cast(weight_mult_data); + per_channel_quant_params.shift = 
const_cast(weight_shift_data); + + status = arm_fully_connected_per_channel_s8( + &ctx, + &fc_params, + &per_channel_quant_params, + &input_dims, + batch_input, + &filter_dims, + weight_data, + &bias_dims, + bias_data, + &output_dims, + batch_output); + } else { + fc_params.filter_offset = -weight_zp_data[0]; + cmsis_nn_per_tensor_quant_params per_tensor_quant_params; + per_tensor_quant_params.multiplier = weight_mult_data[0]; + per_tensor_quant_params.shift = weight_shift_data[0]; + + status = arm_fully_connected_s8( + &ctx, + &fc_params, + &per_tensor_quant_params, + &input_dims, + batch_input, + &filter_dims, + weight_data, + &bias_dims, + bias_data, + &output_dims, + batch_output); + } + + if (status != ARM_CMSIS_NN_SUCCESS) { + ET_LOG( + Error, + "quantized_linear_out: CMSIS-NN failed with status [%d]", + status); + context.fail(Error::Internal); + return out; + } + } + return out; +} + +// Functional variant (stub, not used at runtime) +Tensor quantized_linear( + KernelRuntimeContext& context, + const Tensor& input, + const Scalar& input_zero_point, + const Scalar& input_multiplier, + const Scalar& input_shift, + const Tensor& weights, + const Tensor& weight_zero_point, + const Tensor& weight_multiplier, + const Tensor& weight_shift, + const torch::executor::optional& bias, + const Tensor& bias_multiplier, + const Tensor& bias_shift, + const Tensor& scratch_buffer, + const Scalar& output_zero_point, + const Scalar& in_features, + const Scalar& out_features) { + ET_LOG(Info, "quantized_linear: called"); + assert(false); + return const_cast(input); +} + +} // namespace native +} // namespace cortex_m diff --git a/backends/cortex_m/ops/operators.py b/backends/cortex_m/ops/operators.py index 926dcd85e4b..d642531e950 100644 --- a/backends/cortex_m/ops/operators.py +++ b/backends/cortex_m/ops/operators.py @@ -223,3 +223,216 @@ def quantized_add_out_impl( out.copy_(result_quantized) return out + + +# 
=================================================================== +# QUANTIZED LINEAR OPERATION DEFINITION +# =================================================================== + + +def _check_per_tensor_or_per_channel(param: torch.Tensor, out_channels: int, name: str): + assert param.numel() in [ + 1, + out_channels, + ], f"{name} must be per-tensor (1) or per-channel ({out_channels}), got {param.numel()}" + + +lib.define( + "quantized_linear.out(" + "Tensor input, Scalar input_zero_point, Scalar input_multiplier, Scalar input_shift, " + "Tensor weights, " + "Tensor weight_zero_point, Tensor weight_multiplier, Tensor weight_shift, " + "Tensor? bias, Tensor bias_multiplier, Tensor bias_shift, " + "Tensor scratch_buffer, Scalar output_zero_point, Scalar in_features, Scalar out_features, " + "*, Tensor(a!) out) -> Tensor(a!)" +) + +# Define functional variant (non-out version) +lib.define( + "quantized_linear(" + "Tensor input, Scalar input_zero_point, Scalar input_multiplier, Scalar input_shift, " + "Tensor weights, " + "Tensor weight_zero_point, Tensor weight_multiplier, Tensor weight_shift, " + "Tensor? 
bias, Tensor bias_multiplier, Tensor bias_shift, " + "Tensor scratch_buffer, Scalar output_zero_point, Scalar in_features, Scalar out_features" + ") -> Tensor" +) + + +# Fake meta function for shape inference (out variant) +@register_fake("cortex_m::quantized_linear.out") +def quantized_linear_out_meta( + input: torch.Tensor, + input_zero_point: int, + input_multiplier: int, + input_shift: int, + weights: torch.Tensor, + weight_zero_point: torch.Tensor, + weight_multiplier: torch.Tensor, + weight_shift: torch.Tensor, + bias: torch.Tensor, + bias_multiplier: torch.Tensor, + bias_shift: torch.Tensor, + scratch_buffer: torch.Tensor, + output_zero_point: int, + in_features: int, + out_features: int, + out: torch.Tensor, +) -> torch.Tensor: + # Validate dimensions + batch_size = input.shape[0] + out_channels = weights.shape[0] + + # Validate weight quantization parameters dimensions + _check_per_tensor_or_per_channel( + weight_zero_point, out_channels, "weight_zero_point" + ) + _check_per_tensor_or_per_channel( + weight_multiplier, out_channels, "weight_multiplier" + ) + _check_per_tensor_or_per_channel(weight_shift, out_channels, "weight_shift") + + # Validate output shape + expected_shape = (batch_size, out_channels) + assert ( + out.shape == expected_shape + ), f"Output shape {out.shape} must be {expected_shape}" + + return out + + +# Fake meta function for shape inference (functional variant) +@register_fake("cortex_m::quantized_linear") +def quantized_linear_meta( + input: torch.Tensor, + input_zero_point: int, + input_multiplier: int, + input_shift: int, + weights: torch.Tensor, + weight_zero_point: torch.Tensor, + weight_multiplier: torch.Tensor, + weight_shift: torch.Tensor, + bias: torch.Tensor, + bias_multiplier: torch.Tensor, + bias_shift: torch.Tensor, + scratch_buffer: torch.Tensor, + output_zero_point: int, + in_features: int, + out_features: int, +) -> torch.Tensor: + # Validate dimensions (same as out variant) + batch_size = input.shape[0] + out_channels 
= weights.shape[0] + + # Validate weight quantization parameters dimensions + _check_per_tensor_or_per_channel( + weight_zero_point, out_channels, "weight_zero_point" + ) + _check_per_tensor_or_per_channel( + weight_multiplier, out_channels, "weight_multiplier" + ) + _check_per_tensor_or_per_channel(weight_shift, out_channels, "weight_shift") + + # Calculate output shape for functional variant + output_shape = (batch_size, out_channels) + return torch.empty(output_shape, dtype=input.dtype, device=input.device) + + +@impl(lib, "quantized_linear.out", "CompositeExplicitAutograd") +def quantized_linear_out_impl( + input: torch.Tensor, + input_zero_point: int, + input_multiplier: int, + input_shift: int, + weights: torch.Tensor, + weight_zero_point: torch.Tensor, + weight_multiplier: torch.Tensor, + weight_shift: torch.Tensor, + bias: torch.Tensor, + bias_multiplier: torch.Tensor, + bias_shift: torch.Tensor, + scratch_buffer: torch.Tensor, + output_zero_point: int, + in_features: int, + out_features: int, + *, + out: torch.Tensor, +) -> torch.Tensor: + """ + Fallback implementation for meta/testing + Note: This won't be called at runtime, only during compilation + """ + + # Per-channel dequantization + input_scale = input_multiplier * (2.0 ** (-input_shift)) + input_fp = (input.float() - input_zero_point) * input_scale + if weight_zero_point.numel() == 1: + # Per-tensor + weight_scale = weight_multiplier.item() * (2.0 ** (-weight_shift.item())) + weights_fp = (weights.float() - weight_zero_point.item()) * weight_scale + else: + # Per-channel + weight_scales = weight_multiplier.float() * (2.0 ** (-weight_shift.float())) + weights_fp = ( + weights.float() - weight_zero_point.float().unsqueeze(1) + ) * weight_scales.unsqueeze(1) + bias_fp = None + if bias is not None: + bias_scales = bias_multiplier.float() * (2.0 ** (-bias_shift.float())) + bias_fp = bias.float() * bias_scales + + result_fp = torch.nn.functional.linear(input_fp, weights_fp, bias_fp) + else: + result_fp = 
torch.nn.functional.linear(input_fp, weights_fp) + result_quantized = torch.clamp( + torch.round(result_fp + output_zero_point), -128, 127 + ).to(torch.int8) + out.copy_(result_quantized) + return out + + +# Functional variant implementation +@impl(lib, "quantized_linear", "CompositeExplicitAutograd") +def quantized_linear_impl( + input: torch.Tensor, + input_zero_point: int, + input_multiplier: int, + input_shift: int, + weights: torch.Tensor, + weight_zero_point: torch.Tensor, + weight_multiplier: torch.Tensor, + weight_shift: torch.Tensor, + bias: torch.Tensor, + bias_multiplier: torch.Tensor, + bias_shift: torch.Tensor, + scratch_buffer: torch.Tensor, + output_zero_point: int, + in_features: int, + out_features: int, +) -> torch.Tensor: + """ + Functional variant - creates output tensor and calls out variant + """ + # Create output tensor + batch_size = input.shape[0] + output = torch.empty( + (batch_size, out_features), dtype=torch.int8, device=input.device + ) + return quantized_linear_out_impl( + input, + input_zero_point, + input_multiplier, + input_shift, + weights, + weight_zero_point, + weight_multiplier, + weight_shift, + bias, + bias_multiplier, + bias_shift, + scratch_buffer, + output_zero_point, + in_features, + out_features, + out=output, + ) diff --git a/backends/cortex_m/ops/operators.yaml b/backends/cortex_m/ops/operators.yaml index f2615a1f525..b41c0c68fa5 100644 --- a/backends/cortex_m/ops/operators.yaml +++ b/backends/cortex_m/ops/operators.yaml @@ -27,3 +27,15 @@ kernels: - arg_meta: null kernel_name: cortex_m::quantized_add_out + +- func: cortex_m::quantized_linear(Tensor input, Scalar input_zero_point, Scalar input_multiplier, Scalar input_shift, Tensor weights, Tensor weight_zero_point, Tensor weight_multiplier, Tensor weight_shift, Tensor? 
bias, Tensor bias_multiplier, Tensor bias_shift, Tensor scratch_buffer, Scalar output_zero_point, Scalar in_features, Scalar out_features) -> Tensor + variants: function + kernels: + - arg_meta: null + kernel_name: cortex_m::quantized_linear + +- func: cortex_m::quantized_linear.out(Tensor input, Scalar input_zero_point, Scalar input_multiplier, Scalar input_shift, Tensor weights, Tensor weight_zero_point, Tensor weight_multiplier, Tensor weight_shift, Tensor? bias, Tensor bias_multiplier, Tensor bias_shift, Tensor scratch_buffer, Scalar output_zero_point, Scalar in_features, Scalar out_features, *, Tensor(a!) out) -> Tensor(a!) + variants: function + kernels: + - arg_meta: null + kernel_name: cortex_m::quantized_linear_out diff --git a/backends/cortex_m/passes/passes_utils.py b/backends/cortex_m/passes/passes_utils.py index 3f6e05fc4de..7155f997bf4 100644 --- a/backends/cortex_m/passes/passes_utils.py +++ b/backends/cortex_m/passes/passes_utils.py @@ -8,6 +8,10 @@ import torch +from executorch.exir.dialects._ops import ops as exir_ops + +from torch.fx import Node + def dequantize_per_tensor_cmsis( qtensor: torch.Tensor, zero_point: int, multiplier: int, shift: int @@ -92,3 +96,58 @@ def quantize_multiplier_aot(scale: float) -> tuple[int, int]: def cleanup_erased_nodes(graph_module: torch.fx.GraphModule): # Placeholder for any additional cleanup if needed pass + + +def transfer_metadata( + new_node: Node, source_node: Node, pass_name: str = "QuantizedPass" +) -> None: + """Transfer metadata with proper provenance tracking.""" + if hasattr(source_node, "meta") and source_node.meta: + new_node.meta = source_node.meta.copy() + if "from_node" in new_node.meta: + from_node_list = new_node.meta.get("from_node", []).copy() + from_node_list.append( + {"source": source_node.name, "pass": pass_name, "op": "fuse"} + ) + new_node.meta["from_node"] = from_node_list + for field in ["tensor_meta", "stack_trace"]: + if field in source_node.meta: + new_node.meta[field] = 
source_node.meta[field] + + +def is_dequant_node(node: Node) -> bool: + """Check if node is a dequantize operation.""" + dequant_targets = { + exir_ops.edge.cortex_m.dequantize_per_tensor.default, + exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default, + exir_ops.edge.quantized_decomposed.dequantize_per_channel.default, + } + return node.op == "call_function" and node.target in dequant_targets + + +def is_quant_node(node: Node) -> bool: + """Check if node is a quantize operation.""" + quant_targets = { + exir_ops.edge.cortex_m.quantize_per_tensor.default, + exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, + } + return node.op == "call_function" and node.target in quant_targets + + +def cleanup_nodes(nodes_to_erase, graph): + """Clean up marked nodes from graph.""" + failed_nodes = [] + + for node in reversed(nodes_to_erase): + if node in graph.nodes and len(node.users) == 0: + try: + graph.erase_node(node) + except Exception as e: + print(f"Warning: Failed to erase node {node}: {e}") + failed_nodes.append(node) + continue + + if failed_nodes: + print(f"Warning: {len(failed_nodes)} nodes could not be erased") + + return failed_nodes diff --git a/backends/cortex_m/passes/quantized_linear_fusion_pass.py b/backends/cortex_m/passes/quantized_linear_fusion_pass.py new file mode 100644 index 00000000000..11a49beb2f4 --- /dev/null +++ b/backends/cortex_m/passes/quantized_linear_fusion_pass.py @@ -0,0 +1,646 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import logging +from typing import Optional + +import executorch.backends.cortex_m.ops.operators # noqa +import torch +import torch.fx + +from executorch.backends.cortex_m.passes.passes_utils import ( + cleanup_nodes, + is_dequant_node, + quantize_multiplier_aot, + transfer_metadata, +) + +from executorch.backends.transforms.utils import create_mutable_buffer, get_param_tensor + +from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass +from executorch.exir import ExportedProgram +from executorch.exir.dialects._ops import ops as exir_ops +from torch.fx import Node +from torch.fx.passes.infra.pass_manager import PassResult + +logger = logging.getLogger("quantized_linear_fusion_pass") +logger.setLevel(logging.INFO) + + +class QuantizedLinearFusionPass(XNNPACKPass): + """ + Cortex-M backend pass that fuses quantized linear-like patterns. + Fuses: dequantize -> [linear/addmm/fc_ops] -> quantize + Into: cortex_m.quantized_linear.default with direct parameters. + """ + + SUPPORTED_OPS_MAPPING = { + exir_ops.edge.aten.addmm.default: exir_ops.edge.cortex_m.quantized_linear.default, + exir_ops.edge.aten.mm.default: exir_ops.edge.cortex_m.quantized_linear.default, + } + + requires_exported_program = True + + def __init__(self, exported_program: ExportedProgram): + super().__init__(exported_program) + self.nodes_to_erase = [] + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + logger.info("Starting QuantizedLinearFusionPass") + assert id(self._exported_program.graph_module.graph) == id( + graph_module.graph + ), "QuantizedLinearFusionPass requires same graph instance" + + try: + fusion_count = self._fuse_quantized_linear_patterns(graph_module) + if fusion_count > 0: + graph_module.graph.eliminate_dead_code() + graph_module.graph.lint() + graph_module.recompile() + logger.info(f"Linear fusion completed: {fusion_count} patterns fused") + return PassResult(graph_module, fusion_count > 0) + except Exception as e: + logger.error(f"Error in 
QuantizedLinearFusionPass: {e}") + raise e + + def _extract_linear_pattern(self, quantize_node: Node): + if not quantize_node.args: + return None + fc_node = quantize_node.args[0] + if not ( + fc_node.op == "call_function" + and fc_node.target in self.SUPPORTED_OPS_MAPPING + ): + return None + + op_name = str(fc_node.target).split(".")[-1] + + if "addmm" in str(fc_node.target): + input_dq_node = fc_node.args[1] + else: + input_dq_node = fc_node.args[0] + if not is_dequant_node(input_dq_node): + logger.info("input_dq_node is not a dequant node") + return None + weight_dq_node, bias_dq_node = self._extract_weight_bias_from_fc_op(fc_node) + if not weight_dq_node: + logger.info("No weight, bias dequantize node found") + return None + return ( + quantize_node, + fc_node, + input_dq_node, + weight_dq_node, + bias_dq_node, + op_name, + ) + + def _extract_weight_bias_from_fc_op(self, fc_node: Node): + """Generic extraction for FC-like operations.""" + + if "addmm" in str(fc_node.target): + if len(fc_node.args) >= 3: + bias_arg = fc_node.args[0] + weight_arg = fc_node.args[2] + weight_dq_node = self._trace_to_dequantize(weight_arg) + logger.info( + f"weight_arg: {weight_arg}, traced weight_dq_node: {weight_dq_node}" + ) + + if weight_dq_node is None: + logger.info("No weight dequantize node found ") + + # For bias, try to trace to dequantize but allow None (no-bias case) + bias_dq_node = self._trace_to_dequantize(bias_arg) + if bias_dq_node is None: + logger.info("No bias dequantize node found - likely no-bias linear") + return weight_dq_node, bias_dq_node + elif any(op in str(fc_node.target) for op in ["linear", "mm"]): + if len(fc_node.args) >= 2: + weight_arg = fc_node.args[1] + bias_arg = fc_node.args[2] if len(fc_node.args) > 2 else None + weight_dq_node = self._trace_to_dequantize(weight_arg) + bias_dq_node = self._trace_to_dequantize(bias_arg) if bias_arg else None + return weight_dq_node, bias_dq_node + return None, None + + def 
_extract_input_quantization_parameters( + self, input_dq_node: Node + ) -> Optional[dict]: + """Extract input quantization parameters from dequantize node.""" + try: + # Find the quantize operation that produces the int8 tensor + input_quantize_node = None + if hasattr(input_dq_node, "args") and input_dq_node.args: + quantize_candidate = input_dq_node.args[0] + if getattr( + quantize_candidate, "op", None + ) == "call_function" and "quantize" in str( + getattr(quantize_candidate, "target", "") + ): + input_quantize_node = quantize_candidate + + if not input_quantize_node: + logger.error("Could not find quantize node for input!") + return None + + # Extract input quantization parameters + input_scale = self._extract_param_value(input_dq_node.args[1]) + input_zero_point = int(self._extract_param_value(input_dq_node.args[2])) + input_multiplier, input_shift = quantize_multiplier_aot(input_scale) + + return { + "input_scale": input_scale, + "input_zero_point": input_zero_point, + "input_multiplier": input_multiplier, + "input_shift": input_shift, + "input_tensor": input_quantize_node, + } + except Exception as e: + logger.error(f"Failed to extract input quantization parameters: {e}") + return None + + def _extract_output_quantization_parameters( + self, quantize_node: Node + ) -> Optional[dict]: + """Extract output quantization parameters from quantize node.""" + try: + output_scale = self._extract_param_value(quantize_node.args[1]) + output_zero_point = int(self._extract_param_value(quantize_node.args[2])) + + return { + "output_scale": output_scale, + "output_zero_point": output_zero_point, + } + except Exception as e: + logger.error(f"Failed to extract output quantization parameters: {e}") + return None + + def _create_constant_parameter_buffer( + self, graph, quantize_node: Node, data: torch.Tensor, name: str + ): + """Create a parameter buffer""" + buffer_name = f"{name}_{id(quantize_node)}" + + setattr(graph.owning_module, buffer_name, data) + + # Create a 
get_attr node + with graph.inserting_before(quantize_node): + buffer_node = graph.create_node( + op="get_attr", target=buffer_name, name=buffer_name + ) + + # Set metadata + buffer_node.meta["val"] = data + + return buffer_node + + def _extract_weight_parameters(self, weight_dq_node: Node) -> Optional[dict]: + try: + weight_tensor = weight_dq_node.args[0] + weight_scale = weight_dq_node.args[1] + weight_zero_point = ( + weight_dq_node.args[2] if len(weight_dq_node.args) > 2 else None + ) + + weight_scale_data = self._extract_param_value(weight_scale) + weight_zp_data = ( + self._extract_param_value(weight_zero_point) + if weight_zero_point + else None + ) + + # Get actual tensor data to determine output features + weight_tensor_data = get_param_tensor(self._exported_program, weight_tensor) + out_features = weight_tensor_data.shape[0] + + # Handle both per-tensor and per-channel + if ( + isinstance(weight_scale_data, torch.Tensor) + and weight_scale_data.numel() > 1 + ): + # Per-channel: ensure we have the right number of elements + assert ( + weight_scale_data.numel() == out_features + ), f"Scale size {weight_scale_data.numel()} != out_features {out_features}" + + multipliers = [] + shifts = [] + for scale in weight_scale_data: + mult, shift = quantize_multiplier_aot(scale.item()) + multipliers.append(mult) + shifts.append(shift) + + weight_multiplier = torch.tensor(multipliers, dtype=torch.int32) + weight_shift = torch.tensor(shifts, dtype=torch.int32) + weight_zp_tensor = ( + weight_zp_data.int() + if weight_zp_data is not None + else torch.zeros(out_features, dtype=torch.int32) + ) + else: + # Per-tensor: create tensors with correct size for output features + scale_val = ( + weight_scale_data.item() + if isinstance(weight_scale_data, torch.Tensor) + else weight_scale_data + ) + mult, shift = quantize_multiplier_aot(scale_val) + + # Create tensors sized for out_features (not single element) + weight_multiplier = torch.full((out_features,), mult, 
dtype=torch.int32) + weight_shift = torch.full((out_features,), shift, dtype=torch.int32) + weight_zp_tensor = torch.full( + (out_features,), + weight_zp_data if weight_zp_data else 0, + dtype=torch.int32, + ) + + # Validate multipliers + for i, mult in enumerate(weight_multiplier): + if mult < (1 << 30) or mult > ((1 << 31) - 1): + logger.error( + f"Invalid multiplier[{i}]: {mult}, scale was: {weight_scale_data}" + ) + return None + + return { + "weight_tensor": weight_tensor, + "weight_zero_point_data": weight_zp_tensor, + "weight_multiplier_data": weight_multiplier, + "weight_shift_data": weight_shift, + } + except Exception as e: + logger.error(f"Failed to extract weight parameters: {e}") + return None + + def _extract_bias_parameters(self, bias_dq_node: Optional[Node]) -> Optional[dict]: + """ + Extract bias parameters for quantized linear fusion. + Handles both dequantized bias nodes and constant bias tensors. + Returns a dict with bias_tensor, bias_multiplier, and bias_shift. + """ + if not bias_dq_node: + # No bias present + return None + try: + # Case 1: Bias is a dequantize node + if hasattr(bias_dq_node, "op") and is_dequant_node(bias_dq_node): + bias_tensor = bias_dq_node.args[0] + bias_scale = bias_dq_node.args[1] + + bias_scale_data = self._extract_param_value(bias_scale) + + if ( + isinstance(bias_scale_data, torch.Tensor) + and bias_scale_data.numel() > 1 + ): + # Per-channel bias + bias_multipliers = [] + bias_shifts = [] + for scale_val in bias_scale_data.tolist(): + mult, shift = quantize_multiplier_aot(scale_val) + bias_multipliers.append(mult) + bias_shifts.append(shift) + return { + "bias_tensor": bias_tensor, + "bias_multiplier": bias_multipliers, + "bias_shift": bias_shifts, + } + else: + # Per-tensor bias + bias_scale_val = ( + bias_scale_data.item() + if isinstance(bias_scale_data, torch.Tensor) + else bias_scale_data + ) + bias_multiplier, bias_shift = quantize_multiplier_aot( + bias_scale_val + ) + return { + "bias_tensor": bias_tensor, 
+ "bias_multiplier": bias_multiplier, + "bias_shift": bias_shift, + } + else: + # Case 2: Bias is a constant tensor (not dequantized) + # This can happen if bias is not quantized in the model + bias_tensor = bias_dq_node + # Use default multiplier/shift for unquantized bias + bias_multiplier = 1 + bias_shift = 0 + return { + "bias_tensor": bias_tensor, + "bias_multiplier": bias_multiplier, + "bias_shift": bias_shift, + } + except Exception as e: + logger.error(f"Failed to extract bias parameters: {e}") + return None + + def _prepare_bias_tensors( + self, bias_params: Optional[dict], out_features: int + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Prepare bias multiplier and shift tensors for kernel call. + Returns (bias_multiplier_tensor, bias_shift_tensor) both sized [out_features]. + """ + if bias_params: + bias_multiplier = bias_params["bias_multiplier"] + bias_shift = bias_params["bias_shift"] + + # Convert to tensors of the right size + if isinstance(bias_multiplier, int): + bias_multiplier_tensor = torch.full( + [out_features], bias_multiplier, dtype=torch.int32 + ) + elif isinstance(bias_multiplier, list): + assert ( + len(bias_multiplier) == out_features + ), f"Bias multiplier size {len(bias_multiplier)} != out_features {out_features}" + bias_multiplier_tensor = torch.tensor( + bias_multiplier, dtype=torch.int32 + ) + elif isinstance(bias_multiplier, torch.Tensor): + assert ( + bias_multiplier.numel() == out_features + ), f"Bias multiplier size {bias_multiplier.numel()} != out_features {out_features}" + bias_multiplier_tensor = bias_multiplier + else: + raise TypeError( + f"Unsupported bias_multiplier type: {type(bias_multiplier)}" + ) + + if isinstance(bias_shift, int): + bias_shift_tensor = torch.full( + [out_features], bias_shift, dtype=torch.int32 + ) + elif isinstance(bias_shift, list): + assert ( + len(bias_shift) == out_features + ), f"Bias shift size {len(bias_shift)} != out_features {out_features}" + bias_shift_tensor = torch.tensor(bias_shift, 
dtype=torch.int32) + elif isinstance(bias_shift, torch.Tensor): + assert ( + bias_shift.numel() == out_features + ), f"Bias shift size {bias_shift.numel()} != out_features {out_features}" + bias_shift_tensor = bias_shift + else: + raise TypeError(f"Unsupported bias_shift type: {type(bias_shift)}") + + return bias_multiplier_tensor, bias_shift_tensor + else: + # No bias: return zero tensors of correct shape + return ( + torch.zeros([out_features], dtype=torch.int32), + torch.zeros([out_features], dtype=torch.int32), + ) + + def _extract_param_value(self, node_or_value): + """ + Extract a scalar value from a Node or a direct float/int. + """ + if isinstance(node_or_value, (float, int)): + return node_or_value + # If it's a tensor, get its scalar value if possible + if isinstance(node_or_value, torch.Tensor): + return node_or_value.item() if node_or_value.numel() == 1 else node_or_value + # If it's a Node, use get_param_tensor + if hasattr(node_or_value, "op"): + tensor = get_param_tensor(self._exported_program, node_or_value) + return tensor.item() if tensor.numel() == 1 else tensor + raise TypeError(f"Unsupported parameter type: {type(node_or_value)}") + + def _calculate_cmsis_scratch_size(self, weight_tensor) -> int: + """Calculate CMSIS-NN scratch buffer size for quantized linear operations. + + Source: CMSIS-NN arm_fully_connected_s8_get_buffer_size() returns filter_dims->w * sizeof(int32_t). + This buffer stores pre-computed kernel sums (weight row sums) - one int32_t per output feature. + Same buffer size applies to both per-tensor and per-channel quantization paths since both use + identical kernel sum optimization in the underlying matrix multiplication. 
+ """ + try: + print(f"weight_tensor type: {type(weight_tensor)}, value: {weight_tensor}") + weight_shape = get_param_tensor(self._exported_program, weight_tensor).shape + out_features = weight_shape[0] # filter_dims->w in CMSIS terms + + # CMSIS-NN implementation expects the following size + cmsis_buffer_size = out_features * 4 # sizeof(int32_t) + return cmsis_buffer_size + except Exception as e: + logger.error(f"Failed to calculate CMSIS scratch size: {e}") + return 2048 # Fallback + + def _create_scratch_buffer(self, graph, quantize_node: Node, weight_tensor): + cmsis_scratch = self._calculate_cmsis_scratch_size(weight_tensor) + + kernel_sum_header = 8 # sizeof(KernelSumHeader) + total_size = kernel_sum_header + cmsis_scratch + + logger.info( + f"Kernel sum header: {kernel_sum_header}, CMSIS buffer: {cmsis_scratch}, total: {total_size}" + ) + + return create_mutable_buffer( + self._exported_program, + name=f"b_cmsis_linear_scratch_{id(quantize_node)}", + data=torch.zeros((total_size,), dtype=torch.int8), + ) + + def _create_fused_node( + self, + graph, + quantize_node: Node, + quant_params: dict, + weight_params: dict, + bias_params: Optional[dict], + quantized_target, + ) -> Node: + """Generic fused node creation for any FC-like operation.""" + # Extract all parameters + input_tensor = quant_params["input_tensor"] + input_zp = quant_params["input_zero_point"] + input_multiplier = quant_params["input_multiplier"] + input_shift = quant_params["input_shift"] + weight_tensor = weight_params["weight_tensor"] + + weight_zp_node = self._create_constant_parameter_buffer( + graph, quantize_node, weight_params["weight_zero_point_data"], "weight_zp" + ) + weight_mult_node = self._create_constant_parameter_buffer( + graph, quantize_node, weight_params["weight_multiplier_data"], "weight_mult" + ) + weight_shift_node = self._create_constant_parameter_buffer( + graph, quantize_node, weight_params["weight_shift_data"], "weight_shift" + ) + # Get dimensions + weight_shape = 
get_param_tensor(self._exported_program, weight_tensor).shape + assert ( + len(weight_shape) == 2 + ), f"Weight tensor must be 2D, got shape {weight_shape}" + in_features = weight_shape[1] + out_features = weight_shape[0] + + # Handle bias + bias_tensor = bias_params["bias_tensor"] if bias_params else None + bias_multiplier, bias_shift = self._prepare_bias_tensors( + bias_params, out_features + ) + output_zp = quant_params["output_zero_point"] + + scratch_buffer = self._create_scratch_buffer( + graph, quantize_node, weight_tensor + ) + + with graph.inserting_after(quantize_node): + fused = graph.create_node( + "call_function", + target=quantized_target, + args=( + input_tensor, + input_zp, + input_multiplier, + input_shift, + weight_tensor, + weight_zp_node, + weight_mult_node, + weight_shift_node, + bias_tensor, + bias_multiplier, + bias_shift, + scratch_buffer, + output_zp, + in_features, + out_features, + ), + kwargs={}, + ) + + transfer_metadata(fused, quantize_node, "QuantizedLinearFusionPass") + return fused + + def _mark_for_cleanup(self, nodes): + for node in nodes: + if node is not None: + self.nodes_to_erase.append(node) + + def _cleanup_nodes(self, graph): + cleanup_nodes(self.nodes_to_erase, graph) + self.nodes_to_erase.clear() + + def _extract_linear_pattern_with_validation(self, quantize_node: Node): + pattern_info = self._extract_linear_pattern(quantize_node) + if not pattern_info: + return None + # Optionally add more validation here if needed + return pattern_info + + def _trace_to_dequantize(self, node: Optional[Node], max_depth=3) -> Optional[Node]: + """Trace through transformations to find dequantize node.""" + current_node = node + depth = 0 + while current_node and depth < max_depth: + if is_dequant_node(current_node): + return current_node + if current_node.op == "call_function" and current_node.target in { + exir_ops.edge.aten.permute_copy.default, + exir_ops.edge.aten.view_copy.default, + }: + if current_node.args: + current_node = 
current_node.args[0] + depth += 1 + continue + break + return None + + def _fuse_quantized_linear_patterns( + self, graph_module: torch.fx.GraphModule + ) -> int: + fusion_count = 0 + graph = graph_module.graph + for node in list(graph.nodes): + if not ( + node.op == "call_function" and "quantize_per_tensor" in str(node.target) + ): + continue + pattern_info = self._extract_linear_pattern_with_validation(node) + if not pattern_info: + continue + + ( + quantize_node, + fc_node, + input_dq_node, + weight_dq_node, + bias_dq_node, + op_name, + ) = pattern_info + + # Get quantized target for this FC operation + quantized_target = self.SUPPORTED_OPS_MAPPING.get(fc_node.target) + if not quantized_target: + logger.warning(f"No quantized target found for {fc_node.target}") + continue + + logger.info(f"✅ Found complete cortex_m Q/DQ + {op_name} pattern!") + + try: + input_params = self._extract_input_quantization_parameters( + input_dq_node + ) + if not input_params: + logger.error( + "Quantization parameter extraction failed for node: %s", node + ) + return None + output_params = self._extract_output_quantization_parameters( + quantize_node + ) + if not output_params: + logger.error( + "Output quantization parameter extraction failed for node: %s", + node, + ) + return None + quant_params = {**input_params, **output_params} + logger.info(f"Quantization parameters: {quant_params}") + + weight_params = self._extract_weight_parameters(weight_dq_node) + if not weight_params: + continue + bias_params = self._extract_bias_parameters(bias_dq_node) + if bias_dq_node and not bias_params: + continue + fused_node = self._create_fused_node( + graph, + quantize_node, + quant_params, + weight_params, + bias_params, + quantized_target, + ) + logger.info(f"Created fused {op_name} node: {fused_node}") + + quantize_node.replace_all_uses_with(fused_node) + self._mark_for_cleanup( + [ + quantize_node, + fc_node, + input_dq_node, + weight_dq_node, + bias_dq_node, + ] + ) + fusion_count += 1 + 
logger.info(f"✅ Successfully fused {op_name} operation {fusion_count}") + except Exception as e: + logger.error( + f"Failed to fuse {op_name} pattern for {fc_node.name}: {e}" + ) + continue + self._cleanup_nodes(graph) + return fusion_count diff --git a/backends/cortex_m/passes/quantized_op_fusion_pass.py b/backends/cortex_m/passes/quantized_op_fusion_pass.py index ca6d8b97795..eebf6866d83 100644 --- a/backends/cortex_m/passes/quantized_op_fusion_pass.py +++ b/backends/cortex_m/passes/quantized_op_fusion_pass.py @@ -36,7 +36,7 @@ class QuantizedOpFusionPass(ExportPass): # Generic operation mapping SUPPORTED_OPS_MAPPING = { exir_ops.edge.aten.add.Tensor: exir_ops.edge.cortex_m.quantized_add.default, - # Future ops to be added here: + # Future binary ops to be added here: } def __init__(self): diff --git a/backends/cortex_m/test/build_test_runner.sh b/backends/cortex_m/test/build_test_runner.sh new file mode 100755 index 00000000000..cc28ac5484a --- /dev/null +++ b/backends/cortex_m/test/build_test_runner.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# TODO: More separation from the regular arm executor runner and testing. + +set -eu + +# Always rebuild executorch in case the cortex-m kernels has been updated. 
+script_dir=$(realpath "$(dirname "${BASH_SOURCE[0]}")") +et_root_dir=$(realpath "${script_dir}/../../..") +build_executorch="${et_root_dir}/backends/arm/scripts/build_executorch.sh" +${build_executorch} + +# Build executor runner with all portable ops selected and semi hosting +build_dir="${et_root_dir}/arm_test" +build_executor_runner="${et_root_dir}/backends/arm/scripts/build_executor_runner.sh" +build_root_test_dir="${et_root_dir}/arm_test/arm_semihosting_executor_runner_corstone-300" + +${build_executor_runner} --pte=semihosting --target=ethos-u55-128 --output="${build_root_test_dir}" diff --git a/backends/cortex_m/test/ops/__init__.py b/backends/cortex_m/test/ops/__init__.py new file mode 100644 index 00000000000..c8d1c683da3 --- /dev/null +++ b/backends/cortex_m/test/ops/__init__.py @@ -0,0 +1,4 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. diff --git a/backends/cortex_m/test/ops/test_add.py b/backends/cortex_m/test/ops/test_add.py new file mode 100644 index 00000000000..b7b0ffcbfbc --- /dev/null +++ b/backends/cortex_m/test/ops/test_add.py @@ -0,0 +1,179 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ + +import torch +from executorch.backends.arm.test.common import parametrize +from executorch.backends.cortex_m.test.tester import ( + CortexMTester, + McuTestCase, + ramp_tensor, +) +from executorch.backends.test.suite.operators.test_add import Model, ModelAlpha + + +class CortexMSelfAdd(torch.nn.Module): + ops_before_transforms = { + "executorch_exir_dialects_edge__ops_aten_add_Tensor": 1, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 2, + } + + ops_after_transforms = { + "executorch_exir_dialects_edge__ops_cortex_m_quantized_add_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1, + } + + def forward(self, x): + return x + x + + +class CortexMScalarAdd(Model): + ops_before_transforms = { + "executorch_exir_dialects_edge__ops_aten_add_Tensor": 1, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3, + } + + ops_after_transforms = { + "executorch_exir_dialects_edge__ops_cortex_m_quantized_add_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1, + } + + +class CortexMTensorAdd(Model): + ops_before_transforms = { + "executorch_exir_dialects_edge__ops_aten_add_Tensor": 1, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 3, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3, + } + + ops_after_transforms = { + "executorch_exir_dialects_edge__ops_cortex_m_quantized_add_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 2, + 
"executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1, + } + + +class CortexMAlphaAdd(ModelAlpha): + ops_before_transforms = { + "executorch_exir_dialects_edge__ops_aten_add_Tensor": 1, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 3, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3, + } + + ops_after_transforms = { + "executorch_exir_dialects_edge__ops_cortex_m_quantized_add_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 2, + "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1, + } + + +test_cases = { + "self_scalar": McuTestCase( + CortexMSelfAdd(), + (10.0,), + ), + "self_rank_1": McuTestCase( + CortexMSelfAdd(), + (torch.linspace(-5, 5, 10),), + ), + "self_rank_2_pos": McuTestCase( + CortexMSelfAdd(), + (ramp_tensor(0, 1000, (10, 1)),), + ), + "self_rank_3_neg": McuTestCase( + CortexMSelfAdd(), + (ramp_tensor(-100, 0, (2, 2, 2)),), + ), + "self_rank_4_small": McuTestCase( + CortexMSelfAdd(), + (ramp_tensor(-0.1, 0.1, (2, 2, 2, 2)),), + ), + "self_rank_5": McuTestCase( + CortexMSelfAdd(), + (ramp_tensor(-5, 5, (2, 2, 2, 2, 2)),), + ), + "scalar_scalar": McuTestCase( + CortexMScalarAdd(), + (-0.5, 1.0), + ), + "tensor_scalar": McuTestCase( + CortexMScalarAdd(), + (torch.ones(2, 2), 1.0), + ), + "scalar_tensor": McuTestCase( + CortexMScalarAdd(), + (1000.0, torch.ones(2, 2)), + ), + "broadcast_1": McuTestCase( + CortexMTensorAdd(), + (torch.ones(1), torch.ones(2, 2, 2, 2)), + ), + "broadcast_2": McuTestCase( + CortexMTensorAdd(), + (torch.ones((2, 1, 1, 1)), torch.ones(1)), + ), + "broadcast_3": McuTestCase( + CortexMTensorAdd(), + ( + ramp_tensor(-2, 2, (2, 1, 2, 1)), + ramp_tensor(-5, 5, (1, 2, 1, 2)), + ), + ), + "alpha": McuTestCase( + CortexMAlphaAdd(0.5), + ( + ramp_tensor(-10, 10, (4, 5)), + ramp_tensor(-20, 20, (4, 5)), + ), + ), +} + + +dialect_xfails = { + "self_scalar": 
("'float' object has no attribute 'fake_mode'", AttributeError), + "self_rank_1": ("Output 0 does not match reference output", AssertionError), + "self_rank_2_pos": ("Output 0 does not match reference output", AssertionError), + "self_rank_3_neg": ("Output 0 does not match reference output", AssertionError), + "self_rank_4_small": ("Output 0 does not match reference output", AssertionError), + "self_rank_5": ("Output 0 does not match reference output", AssertionError), + "scalar_scalar": ("'float' object has no attribute 'fake_mode'", AttributeError), + "broadcast_3": ("Output 0 does not match reference output", AssertionError), + "alpha": ("Expecting kwargs for aten op IR to be empty", AssertionError), +} + + +@parametrize("test_case", test_cases, xfails=dialect_xfails) +def test_dialect_add(test_case): + tester = CortexMTester(test_case.model, test_case.example_inputs) + tester.test_dialect( + test_case.model.ops_before_transforms, test_case.model.ops_after_transforms + ) + + +implementation_xfails = { + "self_scalar": ("'float' object has no attribute 'fake_mode'", AttributeError), + "self_rank_1": ("Output 0 does not match reference output", AssertionError), + "self_rank_2_pos": ("Output 0 does not match reference output", AssertionError), + "self_rank_3_neg": ("Output 0 does not match reference output", AssertionError), + "self_rank_4_small": ("Output 0 does not match reference output", AssertionError), + "self_rank_5": ("Output 0 does not match reference output", AssertionError), + "scalar_scalar": ("'float' object has no attribute 'fake_mode'", AttributeError), + "tensor_scalar": ("Output 0 does not match reference output", AssertionError), + "scalar_tensor": ("Output 0 does not match reference output", AssertionError), + "broadcast_1": ("Output 0 does not match reference output", AssertionError), + "broadcast_2": ("Output 0 does not match reference output", AssertionError), + "broadcast_3": ("Output 0 does not match reference output", AssertionError), + 
"alpha": ("Expecting kwargs for aten op IR to be empty", AssertionError), +} + + +@parametrize("test_case", test_cases, xfails=implementation_xfails) +def test_implementation_add(test_case): + tester = CortexMTester(test_case.model, test_case.example_inputs) + tester.test_implementation() diff --git a/backends/cortex_m/test/ops/test_linear.py b/backends/cortex_m/test/ops/test_linear.py new file mode 100644 index 00000000000..a1275352fcf --- /dev/null +++ b/backends/cortex_m/test/ops/test_linear.py @@ -0,0 +1,211 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +import torch +from executorch.backends.arm.test.common import parametrize +from executorch.backends.cortex_m.test.tester import ( + CortexMTester, + McuTestCase, + ramp_tensor, +) + + +class CortexMMm(torch.nn.Module): + def forward(self, x, y): + return torch.mm(x, y) + + ops_before_transforms = { + "executorch_exir_dialects_edge__ops_aten_mm_default": 1, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3, + } + + ops_after_transforms = { + "executorch_exir_dialects_edge__ops_cortex_m_quantized_linear_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1, + } + + +class CortexMBmm(torch.nn.Module): + def forward(self, x, y): + return torch.bmm(x, y) + + ops_before_transforms = { + "executorch_exir_dialects_edge__ops_aten_bmm_default": 1, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3, + } + + ops_after_transforms = { + 
"executorch_exir_dialects_edge__ops_cortex_m_quantized_linear_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1, + } + + +class CortexMAddmm(torch.nn.Module): + def forward(self, x, y, z, alpha=None, beta=None): + return torch.addmm(beta, x, alpha, y, z) + + ops_before_transforms = { + "executorch_exir_dialects_edge__ops_aten_addmm_default": 1, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3, + } + + ops_after_transforms = { + "executorch_exir_dialects_edge__ops_cortex_m_quantized_linear_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1, + } + + +class CortexMAt(CortexMMm): + def forward(self, x, y): + return x @ y + + +class CortexMMatmul(CortexMMm): + def forward(self, x, y): + return torch.matmul(x, y) + + +class CortexMLinear(CortexMMatmul): + def __init__(self, *args, **kwargs): + super().__init__() + self.linear = torch.nn.Linear(*args, bias=False) + + def forward(self, x): + return self.linear(x) + + +class CortexMLinearBias(CortexMAddmm): + def __init__(self, *args, **kwargs): + super().__init__() + self.linear = torch.nn.Linear(*args, bias=True) + + def forward(self, x): + return self.linear(x) + + +test_cases = { + "mm": McuTestCase( + model=CortexMMm(), + example_inputs=( + ramp_tensor(0, 10, (1, 16)), + ramp_tensor(0, 10, (16, 16)), + ), + ), + "bmm": McuTestCase( + model=CortexMBmm(), + example_inputs=( + ramp_tensor(0, 10, (1, 16, 16)), + ramp_tensor(0, 10, (1, 16, 16)), + ), + ), + "addmm": McuTestCase( + model=CortexMAddmm(), + example_inputs=( + ramp_tensor(0, 10, (1, 16)), + ramp_tensor(0, 10, (16, 16)), + ramp_tensor(0, 10, (16, 16)), + 2, + 4, + ), + ), + 
"addmm_scalars": McuTestCase( + model=CortexMAddmm(), + example_inputs=( + ramp_tensor(0, 10, (1, 16)), + ramp_tensor(0, 10, (16, 16)), + ramp_tensor(0, 10, (16, 16)), + ), + ), + "@-operator": McuTestCase( + model=CortexMAt(), + example_inputs=( + ramp_tensor(0, 10, (1, 16)), + ramp_tensor(0, 10, (16, 16)), + ), + ), + "matmul": McuTestCase( + model=CortexMMatmul(), + example_inputs=( + ramp_tensor(0, 10, (1, 16)), + ramp_tensor(0, 10, (16, 16)), + ), + ), + "linear_rank1": McuTestCase( + model=CortexMLinear(2, 3), + example_inputs=(ramp_tensor(-1, 1, (2,)),), + ), + "linear_rank2_pos": McuTestCase( + model=CortexMLinear(8, 3), + example_inputs=(ramp_tensor(0, 10, (2, 8)),), + ), + "linear_rank3_neg": McuTestCase( + model=CortexMLinear(5, 3), + example_inputs=(ramp_tensor(-40, 0, (4, 2, 5)),), + ), + "linear_rank4": McuTestCase( + model=CortexMLinear(16, 32), + example_inputs=(ramp_tensor(-100, 100, (2, 1, 2, 16)),), + ), + "linear_rank5": McuTestCase( + model=CortexMLinear(4, 3), + example_inputs=(ramp_tensor(-2, 2, (5, 2, 1, 2, 4)),), + ), + "linear_bias": McuTestCase( + model=CortexMLinearBias(61, 37), + example_inputs=(ramp_tensor(0, 10, (8, 61)),), + ), +} + +dialect_xfails = { + "mm": ("torch.mm ops are currently not quantized", RuntimeError), + "bmm": ("torch.bmm ops are currently not quantized", RuntimeError), + "addmm": ("torch.addmm ops are currently not quantized", RuntimeError), + "addmm_scalars": ("torch.addmm ops are currently not quantized", RuntimeError), + "matmul": ("torch.matmul ops are currently not quantized", RuntimeError), + "@-operator": ("@ ops are currently not quantized", RuntimeError), + "linear_rank1": ("Only rank 2 linear ops are fused currently", RuntimeError), + "linear_rank2_pos": ("name 'int32' is not defined", NameError), + "linear_rank3_neg": ("Only rank 2 linear ops are fused currently", RuntimeError), + "linear_rank4": ("Only rank 2 linear ops are fused currently", RuntimeError), + "linear_rank5": ("Only rank 2 linear ops are 
fused currently", RuntimeError), + "linear_bias": ("name 'int32' is not defined", NameError), +} + + +@parametrize("test_case", test_cases, dialect_xfails) +def test_dialect_linear(test_case): + tester = CortexMTester(test_case.model, test_case.example_inputs) + tester.test_dialect( + test_case.model.ops_before_transforms, test_case.model.ops_after_transforms + ) + + +implementation_xfails = { + "mm": ("torch.mm ops are currently not quantized", RuntimeError), + "bmm": ("torch.bmm ops are currently not quantized", RuntimeError), + "addmm": ("torch.addmm ops are currently not quantized", RuntimeError), + "addmm_scalars": ("torch.addmm ops are currently not quantized", RuntimeError), + "matmul": ("torch.matmul ops are currently not quantized", RuntimeError), + "@-operator": ("@ ops are currently not quantized", RuntimeError), + "linear_rank1": ("Only rank 2 linear ops are fused currently", RuntimeError), + "linear_rank2_pos": ("Output 0 does not match reference output.", AssertionError), + "linear_rank3_neg": ("Only rank 2 linear ops are fused currently", RuntimeError), + "linear_rank4": ("Only rank 2 linear ops are fused currently", RuntimeError), + "linear_rank5": ("Only rank 2 linear ops are fused currently", RuntimeError), + "linear_bias": ("Output 0 does not match reference output.", AssertionError), +} + + +@parametrize("test_case", test_cases, implementation_xfails) +def test_implementation_linear(test_case): + tester = CortexMTester(test_case.model, test_case.example_inputs) + tester.test_implementation() diff --git a/backends/cortex_m/test/ops/test_mul.py b/backends/cortex_m/test/ops/test_mul.py new file mode 100644 index 00000000000..a2f13760bf0 --- /dev/null +++ b/backends/cortex_m/test/ops/test_mul.py @@ -0,0 +1,131 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ + +import pytest +import torch +from executorch.backends.arm.test.common import parametrize +from executorch.backends.cortex_m.test.tester import ( + CortexMTester, + McuTestCase, + ramp_tensor, +) +from executorch.backends.test.suite.operators.test_mul import Model + + +class CortexMSelfMul(torch.nn.Module): + ops_before_transforms = { + "executorch_exir_dialects_edge__ops_aten_mul_Tensor": 1, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 2, + } + + ops_after_transforms = { + "executorch_exir_dialects_edge__ops_cortex_m_quantized_mul_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1, + } + + def forward(self, x): + return x * x + + +class CortexMScalarMul(Model): + ops_before_transforms = { + "executorch_exir_dialects_edge__ops_aten_mul_Tensor": 1, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3, + } + + ops_after_transforms = { + "executorch_exir_dialects_edge__ops_cortex_m_quantized_mul_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1, + } + + +class CortexMTensorMul(Model): + ops_before_transforms = { + "executorch_exir_dialects_edge__ops_aten_mul_Tensor": 1, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 3, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3, + } + + ops_after_transforms = { + "executorch_exir_dialects_edge__ops_cortex_m_quantized_mul_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 2, + 
"executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1, + } + + +test_cases = { + "self_scalar": McuTestCase( + CortexMSelfMul(), + (10.0,), + ), + "self_rank_1": McuTestCase( + CortexMSelfMul(), + (ramp_tensor(-5, 5, (10,)),), + ), + "self_rank_2_pos": McuTestCase( + CortexMSelfMul(), + (ramp_tensor(0, 1000, (10, 1)),), + ), + "self_rank_3_neg": McuTestCase( + CortexMSelfMul(), + (ramp_tensor(-100, 0, (2, 2, 2)),), + ), + "self_rank_4_small": McuTestCase( + CortexMSelfMul(), + (ramp_tensor(-0.1, 0.1, (2, 2, 2, 2)),), + ), + "self_rank_5": McuTestCase( + CortexMSelfMul(), + (ramp_tensor(-5, 5, (2, 2, 2, 2, 2)),), + ), + "scalar_scalar": McuTestCase( + CortexMScalarMul(), + (-0.5, 1.0), + ), + "tensor_scalar": McuTestCase( + CortexMScalarMul(), + (torch.ones(2, 2), 1.0), + ), + "scalar_tensor": McuTestCase( + CortexMScalarMul(), + (1000.0, torch.ones(2, 2)), + ), + "broadcast_1": McuTestCase( + CortexMTensorMul(), + (torch.ones(1), torch.ones(2, 2, 2, 2)), + ), + "broadcast_2": McuTestCase( + CortexMTensorMul(), + (torch.ones((2, 1, 1, 1)), torch.ones(1)), + ), + "broadcast_3": McuTestCase( + CortexMTensorMul(), + ( + ramp_tensor(-2, 2, (2, 1, 2, 1)), + ramp_tensor(-5, 5, (1, 2, 1, 2)), + ), + ), +} + + +@pytest.mark.skip(reason="Not implemented yet") +@parametrize("test_case", test_cases) +def test_dialect_mul(test_case): + tester = CortexMTester(test_case.model, test_case.example_inputs) + tester.test_dialect( + test_case.model.ops_before_transforms, test_case.model.ops_after_transforms + ) + + +@pytest.mark.skip(reason="Not implemented yet") +@parametrize("test_case", test_cases) +def test_implementation_mul(test_case): + tester = CortexMTester(test_case.model, test_case.example_inputs) + tester.test_implementation() diff --git a/backends/cortex_m/test/test_quantize_op_fusion_pass.py b/backends/cortex_m/test/test_quantize_op_fusion_pass.py index 1595b0cfbc3..20f2ecfe656 100644 --- a/backends/cortex_m/test/test_quantize_op_fusion_pass.py 
+++ b/backends/cortex_m/test/test_quantize_op_fusion_pass.py @@ -313,7 +313,7 @@ def forward(self, x, y): # Apply passes transformed_program = self._apply_passes(edge_program) - # Generate ExecutorTorch program + # Generate ExecuTorch program executorch_program = transformed_program.to_executorch() # Verify the program contains the expected fused operator diff --git a/backends/cortex_m/test/tester.py b/backends/cortex_m/test/tester.py new file mode 100644 index 00000000000..c492d3c8443 --- /dev/null +++ b/backends/cortex_m/test/tester.py @@ -0,0 +1,114 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +from dataclasses import dataclass +from typing import Any + +import torch +from executorch.backends.arm.test.common import get_u55_compile_spec +from executorch.backends.arm.test.tester.arm_tester import Serialize +from executorch.backends.cortex_m.passes.quantized_linear_fusion_pass import ( + QuantizedLinearFusionPass, +) +from executorch.backends.cortex_m.passes.quantized_op_fusion_pass import ( + QuantizedOpFusionPass, +) + +from executorch.backends.cortex_m.passes.replace_quant_nodes_pass import ( + ReplaceQuantNodesPass, +) +from executorch.backends.test.harness import Tester as TesterBase +from executorch.backends.test.harness.stages import ( + Export, + Quantize, + RunPasses, + StageType, + ToEdgeTransformAndLower, + ToExecutorch, +) +from executorch.backends.xnnpack._passes import XNNPACKPassManager + +from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import ( + get_symmetric_quantization_config, + XNNPACKQuantizer, +) + + +class CortexMQuantize(Quantize): + def __init__(self): + quantizer = XNNPACKQuantizer() + config = get_symmetric_quantization_config() + super().__init__(quantizer, config) + + +class CortexMRunPasses(RunPasses): + def __init__(self): + super().__init__( + XNNPACKPassManager, + 
pass_list=[ + ReplaceQuantNodesPass, + QuantizedLinearFusionPass, + QuantizedOpFusionPass, + ], + ) + + +class CortexMSerialize(Serialize): + def __init__(self): + compile_spec = get_u55_compile_spec() + super().__init__(compile_spec, 1024) + + +cortex_m_stage_classes = { + StageType.EXPORT: Export, + StageType.QUANTIZE: CortexMQuantize, + StageType.RUN_PASSES: CortexMRunPasses, + StageType.SERIALIZE: Serialize, + StageType.TO_EDGE_TRANSFORM_AND_LOWER: ToEdgeTransformAndLower, + StageType.TO_EXECUTORCH: ToExecutorch, + StageType.SERIALIZE: CortexMSerialize, +} + + +class CortexMTester(TesterBase): + def __init__(self, module, example_inputs): + super().__init__(module, example_inputs, cortex_m_stage_classes) + + def test_dialect(self, ops_before_transforms, ops_after_transforms, qtol=0): + """ + Test the python dialect op implementation. + """ + self.quantize() + self.export() + self.to_edge_transform_and_lower() + self.check_count(ops_before_transforms) + self.run_passes() + self.check_count(ops_after_transforms) + self.run_method_and_compare_outputs(inputs=self.example_inputs, qtol=qtol) + + def test_implementation(self, qtol=0): + """ + Test the optimized op implementation in simulation + """ + self.quantize() + self.export() + self.to_edge_transform_and_lower() + self.run_passes() + self.to_executorch() + self.serialize() + self.run_method_and_compare_outputs(inputs=self.example_inputs, qtol=qtol) + + +@dataclass +class McuTestCase: + model: torch.nn.Module + example_inputs: tuple[Any] + + +def ramp_tensor(start: int, end: int, shape: tuple[int]) -> torch.Tensor: + return torch.linspace(start, end, steps=torch.prod(torch.tensor(shape))).reshape( + shape + ) diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt new file mode 100644 index 00000000000..221291442ec --- /dev/null +++ b/backends/cuda/CMakeLists.txt @@ -0,0 +1,82 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# +# Build AOTI CUDA backend for runtime. +# +# ### Editing this file ### +# +# This file should be formatted with +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ +# It should also be cmake-lint clean. +# +cmake_minimum_required(VERSION 3.29) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CUDA_STANDARD 17) +set(CMAKE_CUDA_STANDARD_REQUIRED ON) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# Source root directory for executorch. +if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) +endif() + +find_package(CUDAToolkit REQUIRED) + +# Use ExecutorTorch's standard way to find PyTorch libraries for AOTI +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) +find_package_torch() + +# CUDA-specific AOTI functionality +set(_aoti_cuda_sources + runtime/cuda_backend.cpp runtime/shims/memory.cpp + runtime/shims/tensor_attribute.cpp runtime/guard.cpp + runtime/shims/cuda_guard.cpp +) +add_library(aoti_cuda STATIC ${_aoti_cuda_sources}) +target_include_directories( + aoti_cuda + PUBLIC ${CUDAToolkit_INCLUDE_DIRS} + $ + $ + # PyTorch AOTI headers from ExecutorTorch's torch detection + ${TORCH_INCLUDE_DIRS} +) +target_compile_options( + aoti_cuda PUBLIC $<$:/EHsc /GR> + $<$>:-fexceptions -frtti -fPIC> +) +# Ensure symbols are exported properly +target_link_options( + aoti_cuda PUBLIC $<$>:-Wl,--export-dynamic> +) + +# Link against CUDA::cudart, common AOTI library, and PyTorch CUDA libraries +target_link_libraries( + aoti_cuda PUBLIC aoti_common CUDA::cudart ${CMAKE_DL_LIBS} +) +# If you need other CUDA libraries, link them similarly: +# target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...) 
+executorch_target_link_options_shared_lib(aoti_cuda) + +if(BUILD_TESTING) + # Add runtime + add_executable(voxtral_runner tests/voxtral_runner.cpp) + target_link_libraries( + voxtral_runner PUBLIC aoti_cuda extension_module_static + extension_flat_tensor portable_ops_lib + ) +endif() + +install( + TARGETS aoti_cuda + EXPORT ExecuTorchTargets + DESTINATION lib +) diff --git a/backends/cuda/TARGETS b/backends/cuda/TARGETS new file mode 100644 index 00000000000..fe57f7f1b63 --- /dev/null +++ b/backends/cuda/TARGETS @@ -0,0 +1,35 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +oncall("executorch") + +runtime.python_library( + name = "cuda_backend", + srcs = [ + "cuda_backend.py", + "replace_slice_copy_with_slice.py", + ], + visibility = [ + "//executorch/...", + ], + deps = [ + "//caffe2:torch", + "//executorch/exir/_serialize:lib", + "//executorch/exir/backend:backend_details", + "//executorch/exir/backend:compile_spec_schema", + ], +) + +runtime.python_library( + name = "cuda_partitioner", + srcs = [ + "cuda_partitioner.py", + ], + visibility = [ + "//executorch/...", + ], + deps = [ + "//caffe2:torch", + "//executorch/exir/backend:partitioner", + "//executorch/exir/backend:utils", + ], +) diff --git a/backends/cuda/__init__.py b/backends/cuda/__init__.py new file mode 100644 index 00000000000..2e41cd717f6 --- /dev/null +++ b/backends/cuda/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py new file mode 100644 index 00000000000..ef98de29f23 --- /dev/null +++ b/backends/cuda/cuda_backend.py @@ -0,0 +1,207 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import contextlib +import os +import typing +from enum import Enum + +from typing import Any, Dict, final, List, Optional, Set + +import torch +from executorch.backends.cuda.replace_slice_copy_with_slice import ( + ReplaceSliceCopyWithSlicePass, +) +from executorch.exir._serialize._named_data_store import NamedDataStore +from executorch.exir._warnings import experimental +from executorch.exir.backend.backend_details import ( + BackendDetails, + ExportedProgram, + PreprocessResult, +) +from executorch.exir.backend.compile_spec_schema import CompileSpec +from torch._inductor.codegen.cpp_wrapper_cpu import CppWrapperCpu +from torch.export.passes import move_to_device_pass +from torch.nn.attention import SDPBackend + +# exist fallback operators in et namespace; +supported_fallback_kernels: Dict[str, Any] = {} + +# required fallback kernels but not supported +missing_fallback_kernels: Set[str] = set() + + +class COMPILE_SPEC_KEYS(Enum): + METHOD_NAME = "method_name" + + +# context manager for non-fallback guarantee +# it will raise exception when generating fallback kernels during aoti compile +@contextlib.contextmanager +def collect_unsupported_fallback_kernels(): + original_generate_c_shim_extern_kernel_call = ( + CppWrapperCpu.generate_c_shim_extern_kernel_call + ) + original_generate_fallback_kernel_with_runtime_lookup_aot = ( + CppWrapperCpu.generate_fallback_kernel_with_runtime_lookup_aot + ) + + def generate_c_shim_extern_kernel_call_and_collect_unsupported_kernels( + self, + kernel: str, + args: list[str], + device: str, + *, + debug_args: Optional[list[str]] = None, + ): + if kernel not in supported_fallback_kernels: + missing_fallback_kernels.add(kernel) + + original_generate_c_shim_extern_kernel_call( + self, kernel, args, device, debug_args=debug_args + ) + + def 
generate_fallback_kernel_with_runtime_lookup_aot_and_collect_unsupported_kernels( + self, + op_overload, + raw_args, + output_args, + raw_outputs, + ): + # Extract kernel name for collection + kernel_name = getattr(op_overload, "_name", str(op_overload)) + if kernel_name not in supported_fallback_kernels: + missing_fallback_kernels.add(kernel_name) + + original_generate_fallback_kernel_with_runtime_lookup_aot( + self, op_overload, raw_args, output_args, raw_outputs + ) + + CppWrapperCpu.generate_c_shim_extern_kernel_call = ( + generate_c_shim_extern_kernel_call_and_collect_unsupported_kernels + ) + CppWrapperCpu.generate_fallback_kernel_with_runtime_lookup_aot = ( + generate_fallback_kernel_with_runtime_lookup_aot_and_collect_unsupported_kernels + ) + try: + yield + finally: + CppWrapperCpu.generate_c_shim_extern_kernel_call = ( + original_generate_c_shim_extern_kernel_call + ) + CppWrapperCpu.generate_fallback_kernel_with_runtime_lookup_aot = ( + original_generate_fallback_kernel_with_runtime_lookup_aot + ) + + +@final +@experimental( + "This API and all of cuda backend related functionality are experimental." +) +class CudaBackend(BackendDetails): + """ + CudaBackend is a backend that compiles a model to run on CUDA devices. It uses the AOTInductor compiler to generate + optimized CUDA kernels for the model's operators with libtorch-free. The compiled model can be executed on CUDA devices + using the Executorch runtime. 
+ """ + + @staticmethod + def preprocess( + edge_program: ExportedProgram, + compile_specs: List[CompileSpec], + ) -> PreprocessResult: + # Move the edge_program from CPU to CUDA for aoti compile + cuda_edge_program = move_to_device_pass(edge_program, "cuda") + + # replace slice_copy with slice + ReplaceSliceCopyWithSlicePass()(cuda_edge_program.graph_module) + + edge_program_module = cuda_edge_program.module() + + # Grab all input placeholders from the graph + user_input_names = cuda_edge_program.graph_signature.user_inputs + user_input_placeholders = [] + for node in cuda_edge_program.graph.nodes: + if node.op == "placeholder" and node.name in user_input_names: + user_input_placeholders.append(node.meta["val"]) + + options: dict[str, typing.Any] = { + # Better model precision + "emulate_precision_casts": True, + # Embed CUDA kernel binaries directly into the compiled shared object + "aot_inductor.embed_kernel_binary": True, + # Do not link against the full PyTorch/libtorch library + "aot_inductor.link_libtorch": False, + # Package model constants and other generated files directly in the shared object (.so) file + "aot_inductor.package_constants_in_so": True, + # Enable maximum automatic tuning for optimal performance + "max_autotune": True, + # Use TRITON for GEMM (General Matrix Multiply) operations tuning only to avoid using operators in libtorch + "max_autotune_gemm_backends": "TRITON", + # Use TRITON backend for convolution operations tuning only to avoid using operators in libtorch + "max_autotune_conv_backends": "TRITON", + } + + with collect_unsupported_fallback_kernels(), torch.nn.attention.sdpa_kernel( + [ + SDPBackend.MATH # pyre-ignore[16]: Module `torch.nn.attention` has no attribute `SDPBackend`. 
+ ] + ), torch.no_grad(): + # torch._logging.set_logs(post_grad_graphs=True) + so_path = torch._inductor.aot_compile(edge_program_module, tuple(user_input_placeholders), options=options) # type: ignore[arg-type] + if len(missing_fallback_kernels) > 0: + formatted_kernels = "\n - ".join(sorted(missing_fallback_kernels)) + raise RuntimeError( + f"Missing fallback kernels ({len(missing_fallback_kernels)} total):\n - {formatted_kernels}\n" + "Please add them to the AOTI backend." + ) + + # pyre-ignorep[6]: Incompatible parameter type + with open(so_path, "rb") as f: + so_data = f.read() + + named_data_store = NamedDataStore() + method_name = CudaBackend.method_name_from_compile_specs(compile_specs) + named_data_store.add_named_data( + method_name + "_so_blob", so_data, 1, "aoti_cuda_blob" + ) + + # Clean up the generated so file; it has been packaged into the NamdeDataStore + # pyre-ignorep[6]: Incompatible parameter type + os.remove(so_path) + + return PreprocessResult( + processed_bytes=b"", + debug_handle_map={}, + data_store_output=named_data_store.get_named_data_store_output(), + ) + + @staticmethod + def generate_method_name_compile_spec( + method_name: str, + ) -> CompileSpec: + """ + Returns the compile spec representing the model compute precision, for additional details + please refer to the documentation for ``coremltools.precision``. + """ + return CompileSpec( + COMPILE_SPEC_KEYS.METHOD_NAME.value, + method_name.encode("utf-8"), + ) + + @staticmethod + def method_name_from_compile_specs( + compile_specs: List[CompileSpec], + ) -> str: + """ + Returns the method name from the compile specs. 
+ """ + for spec in compile_specs: + if spec.key == COMPILE_SPEC_KEYS.METHOD_NAME.value: + return spec.value.decode("utf-8") + raise RuntimeError( + f"Could not find method name in compile specs: {compile_specs}" + ) diff --git a/backends/cuda/cuda_partitioner.py b/backends/cuda/cuda_partitioner.py new file mode 100644 index 00000000000..64df7b7dcb2 --- /dev/null +++ b/backends/cuda/cuda_partitioner.py @@ -0,0 +1,77 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Callable, Dict, final, List, Optional, Tuple + +import torch +from executorch.backends.cuda.cuda_backend import CudaBackend # usort: skip +from executorch.exir._warnings import experimental +from executorch.exir.backend.compile_spec_schema import CompileSpec +from executorch.exir.backend.partitioner import ( + DelegationSpec, + Partitioner, + PartitionResult, +) +from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer +from torch.export.exported_program import ExportedProgram + + +@final +@experimental( + "This API and all of cuda backend related functionality are experimental." +) +class CudaPartitioner(Partitioner): + """ + CUDA partitioner for AOTInductor backend integration. + + This partitioner creates a single partition containing all operators from the input graph. + It skips core ATen decomposition, allowing the CUDA backend to handle decomposition using + AOTInductor's CUDA-specific decomposition table. + + Only operators that cannot be handled by the aoti-cuda library will be excluded from + the partition and fall back to ExecuTorch's default or custom handling. 
+ """
+
+ def __init__(self, compile_spec: List[CompileSpec]) -> None:
+ self.delegation_spec = DelegationSpec(CudaBackend.__name__, compile_spec)
+
+ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
+ """
+ Fully delegate the graph to AOTInductor by tagging all nodes as a single partition.
+ """
+
+ partition_tags: Dict[str, DelegationSpec] = {}
+ tag = "tag0"
+
+ for node in exported_program.graph.nodes:
+ if node.op != "call_function":
+ continue
+ node.meta["delegation_tag"] = tag
+
+ partition_tags[tag] = self.delegation_spec
+
+ tag_constant_data(exported_program)
+ tag_mutated_buffer(exported_program)
+
+ return PartitionResult(
+ tagged_exported_program=exported_program, partition_tags=partition_tags
+ )
+
+ def ops_to_not_decompose(
+ self, ep: ExportedProgram
+ ) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]:
+ """
+ Return a list of operations that should not be decomposed and let the AOT compiler handle them.
+ Currently we skip ATen decomposition for all ops, and let the cuda backend handle them.
+ """
+ do_not_decompose = set()
+
+ for node in ep.graph.nodes:
+ if node.op == "call_function" and isinstance(
+ node.target, torch._ops.OpOverload
+ ):
+ do_not_decompose.add(node.target)
+ return list(do_not_decompose), None
diff --git a/backends/cuda/replace_slice_copy_with_slice.py b/backends/cuda/replace_slice_copy_with_slice.py
new file mode 100644
index 00000000000..4f16759af35
--- /dev/null
+++ b/backends/cuda/replace_slice_copy_with_slice.py
@@ -0,0 +1,118 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+ +# pyre-strict + +from typing import Dict, Iterable, Tuple + +import torch +from executorch.exir.dialects._ops import ops +from executorch.exir.dialects.edge._ops import EdgeOpOverload +from executorch.exir.pass_base import ExportPass, PassResult +from torch import fx + + +_SLICE_COPY_TARGETS: Tuple[torch._ops.OpOverload | EdgeOpOverload] = ( + torch.ops.aten.slice_copy.Tensor, + ops.edge.aten.slice_copy.Tensor, +) + +_SLICE_TARGETS: Dict[ + torch._ops.OpOverload | EdgeOpOverload, torch._ops.OpOverload | EdgeOpOverload +] = { + torch.ops.aten.slice_copy.Tensor: torch.ops.aten.slice.Tensor, + ops.edge.aten.slice_copy.Tensor: ops.edge.aten.slice.Tensor, +} + + +class ReplaceSliceCopyWithSlicePass(ExportPass): + """Replace non-mutated ``slice_copy`` results with ``slice`` views.""" + + def call(self, graph_module: fx.GraphModule) -> PassResult: + graph_changed = False + + for node in graph_module.graph.nodes: + if node.op != "call_function" or node.target not in _SLICE_COPY_TARGETS: + continue + + if self._has_blocking_user(node, node.users.keys()): + continue + + node.target = _SLICE_TARGETS[node.target] + graph_changed = True + + if graph_changed: + graph_module.graph.lint() + graph_module.recompile() + + return PassResult(graph_module, graph_changed) + + def _has_blocking_user(self, node: fx.Node, users: Iterable[fx.Node]) -> bool: + for user in users: + if self._is_mutating_user(node, user) or self._is_view_user(node, user): + return True + return False + + def _is_mutating_user(self, node: fx.Node, user: fx.Node) -> bool: + if user.op == "call_method": + # Treat in-place tensor methods conservatively as mutations only when the + # method name ends with ``_`` which is the PyTorch convention for mutation. 
+ return isinstance(user.target, str) and user.target.endswith("_") + + if user.op != "call_function": + return False + + target = user.target + if not hasattr(target, "_schema"): + return False + + schema = target._schema # pyre-ignore[16] + # Positional arguments + for index, arg in enumerate(user.args): + if arg is node and self._argument_mutates(schema, index): + return True + + # Keyword arguments + for name, arg in user.kwargs.items(): + if arg is node and self._argument_mutates(schema, name): + return True + + return False + + def _is_view_user(self, node: fx.Node, user: fx.Node) -> bool: + if user.op == "call_method": + # Treat tensor methods conservatively and assume they may be view-producing. + return True + + if user.op != "call_function": + return False + + target = user.target + if getattr(target, "is_view", False): + for arg in user.args: + if arg is node: + return True + for arg in user.kwargs.values(): + if arg is node: + return True + + return False + + def _argument_mutates( + self, schema: torch._C.FunctionSchema, key: int | str + ) -> bool: + arguments = schema.arguments + if isinstance(key, int): + if key >= len(arguments): + return False + argument = arguments[key] + else: + argument = next((arg for arg in arguments if arg.name == key), None) + if argument is None: + return False + + alias_info = argument.alias_info + return bool(alias_info and alias_info.is_write) diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS new file mode 100644 index 00000000000..54412269287 --- /dev/null +++ b/backends/cuda/runtime/TARGETS @@ -0,0 +1,58 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +oncall("executorch") + +runtime.cxx_library( + name = "runtime_shims", + srcs = [ + "guard.cpp", + "shims/cuda_guard.cpp", + "shims/memory.cpp", + "shims/tensor_attribute.cpp", + ], + headers = [ + "guard.h", + "shims/cuda_guard.h", + "shims/memory.h", + "shims/tensor_attribute.h", + "utils.h", + ], + # @lint-ignore 
BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) + link_whole = True, + supports_python_dlopen = True, + # Constructor needed for backend registration. + compiler_flags = ["-Wno-global-constructors"], + visibility = ["@EXECUTORCH_CLIENTS"], + deps = [ + "//executorch/backends/aoti:common_shims", + "//executorch/extension/tensor:tensor", + "//executorch/runtime/core:core", + "//executorch/runtime/core/exec_aten:lib", + "//executorch/runtime/platform:platform", + ], + external_deps = [ + ("cuda", None, "cuda-lazy"), + ], +) + +runtime.cxx_library( + name = "cuda_backend", + srcs = [ + "cuda_backend.cpp", + ], + # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) + link_whole = True, + supports_python_dlopen = True, + # Constructor needed for backend registration. + compiler_flags = ["-Wno-global-constructors"], + visibility = ["@EXECUTORCH_CLIENTS"], + deps = [ + ":runtime_shims", + "//executorch/backends/aoti:aoti_common", + "//executorch/runtime/backend:interface", + "//executorch/runtime/core/exec_aten/util:tensor_util", + ], + external_deps = [ + ("cuda", None, "cuda-lazy"), + ], +) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp new file mode 100644 index 00000000000..805c54ff55c --- /dev/null +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -0,0 +1,351 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +// Include our shim layer headers +#include +#include +#include +#include + +namespace executorch::backends::cuda { + +#define LOAD_SYMBOL(name, handle) \ + do { \ + name = reinterpret_cast(dlsym(handle, #name)); \ + ET_CHECK_OR_RETURN_ERROR( \ + name != nullptr, AccessFailed, "Failed to load " #name); \ + } while (0) + +using namespace std; +using namespace aoti; + +using executorch::aten::ScalarType; +using executorch::runtime::ArrayRef; +using executorch::runtime::Backend; +using executorch::runtime::BackendExecutionContext; +using executorch::runtime::BackendInitContext; +using executorch::runtime::CompileSpec; +using executorch::runtime::DelegateHandle; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::NamedDataMap; +using executorch::runtime::Result; +using executorch::runtime::Span; +using executorch::runtime::etensor::Tensor; + +class ET_EXPERIMENTAL CudaBackend final + : public ::executorch::runtime::BackendInterface { + private: + Error register_shared_library_functions(void* so_handle) const { + LOAD_SYMBOL(AOTInductorModelContainerCreateWithDevice, so_handle); + LOAD_SYMBOL(AOTInductorModelContainerDelete, so_handle); + LOAD_SYMBOL(AOTInductorModelContainerGetNumInputs, so_handle); + LOAD_SYMBOL(AOTInductorModelContainerGetNumOutputs, so_handle); + LOAD_SYMBOL(AOTInductorModelContainerRun, so_handle); + + return Error::Ok; + } + + public: + bool is_available() const override { + return 1; + } + + // Once per loaded binary blob + Result init( + BackendInitContext& context, + FreeableBuffer* processed, // This will be a empty buffer + ArrayRef compile_specs // This will be my empty list + ) const override { + std::string method_name; + for (const CompileSpec& spec : compile_specs) { + if 
(std::strcmp(spec.key, "method_name") == 0) { + method_name.assign( + static_cast(spec.value.buffer), + spec.value.nbytes); // no nullptr guarantee, so pass size + break; + } + } + + std::string so_blob_key = + method_name.empty() ? "so_blob" : method_name + "_so_blob"; + + const NamedDataMap* named_data_map = context.get_named_data_map(); + auto aoti_cuda_buffer = named_data_map->get_data(so_blob_key.c_str()); + ET_CHECK_OR_RETURN_ERROR( + aoti_cuda_buffer.ok(), + Internal, + "Failed to get data for key %s: 0x%x", + so_blob_key.c_str(), + static_cast(aoti_cuda_buffer.error())); + + // Generate dynamic temporary file path + filesystem::path temp_dir = filesystem::temp_directory_path(); + filesystem::path so_path = + temp_dir / (so_blob_key + to_string(getpid()) + ".so"); + + // Create a temporary file + ofstream outfile(so_path.c_str(), ios::binary); + + // Write the ELF buffer to the temporary file + ET_LOG( + Info, + "Writing %zu bytes to %s", + aoti_cuda_buffer->size(), + so_path.c_str()); + + outfile.write( + static_cast(aoti_cuda_buffer->data()), + aoti_cuda_buffer->size()); + + ET_CHECK_OR_RETURN_ERROR( + outfile, AccessFailed, "Failed to write to file %s", so_path.c_str()); + + // Finish writing the file to disk + outfile.close(); + + // Load the ELF using dlopen + void* so_handle = dlopen(so_path.c_str(), RTLD_LAZY | RTLD_LOCAL); + ET_CHECK_OR_RETURN_ERROR( + so_handle != nullptr, + AccessFailed, + "Failed to load shared library: %s", + dlerror()); + + processed->Free(); + + // Register all shared library functions + ET_CHECK_OK_OR_RETURN_ERROR(register_shared_library_functions(so_handle)); + + AOTInductorModelContainerHandle container_handle = nullptr; + + ET_CHECK_OK_OR_RETURN_ERROR(AOTInductorModelContainerCreateWithDevice( + &container_handle, 1, "cuda", nullptr)); + + ET_LOG(Info, "container_handle = %p", container_handle); + + AOTIDelegateHandle* handle = new AOTIDelegateHandle(); + handle->so_handle = so_handle; + handle->so_path = so_path.string(); 
+ handle->container_handle = container_handle; + + // Create a CUDA stream for asynchronous execution + cudaStream_t cuda_stream; + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamCreate(&cuda_stream)); + handle->cuda_stream = static_cast(cuda_stream); + + return (DelegateHandle*)handle; // Return the handle post-processing + } + + // Once per execution + Error execute( + BackendExecutionContext& context, + DelegateHandle* handle_, + Span args) const override { + AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; + + // Need to re-register all the symbols from the so_handle hosted by this + // CudaBackend instance. The reason is that these symbols are + // static/singleton across the whole process. When we share multiple methods + // (meaning multiple so_handle) in the same process, we need to re-register + // the symbols from the so_handle that is being used in this execution. + ET_CHECK_OK_OR_RETURN_ERROR( + register_shared_library_functions(handle->so_handle)); + + size_t n_inputs; + AOTInductorModelContainerGetNumInputs(handle->container_handle, &n_inputs); + + size_t n_outputs; + AOTInductorModelContainerGetNumOutputs( + handle->container_handle, &n_outputs); + + ET_CHECK_OR_RETURN_ERROR( + n_inputs + n_outputs == args.size(), + InvalidArgument, + "number of user input %zd and output %zd generated from AOT Inductor does not match ET runner's %zd. 
Exit.", + n_inputs, + n_outputs, + args.size()) + + // NOTE: ExecuTorch tensors are always on CPU/host memory + // We need to create GPU copies for CUDA kernel execution + std::vector gpu_inputs( + n_inputs); // GPU copies for kernel execution + std::vector gpu_outputs( + n_outputs); // GPU tensors for kernel output + + // Process input tensors: ExecuTorch provides CPU tensors, create GPU + // copies + for (int i = 0; i < n_inputs; i++) { + // Get tensor dimensions and properties from ExecuTorch CPU tensor + auto cpu_tensor = &(args[i]->toTensor()); + auto sizes = cpu_tensor->sizes(); + auto scalar_type = cpu_tensor->scalar_type(); + + // Create GPU tensor with same shape + std::vector sizes_vec(sizes.begin(), sizes.end()); + + AOTITensorHandle gpu_input_handle; + Error create_err = aoti_torch_empty_strided( + sizes_vec.size(), + sizes_vec.data(), + nullptr, // use default strides + static_cast(scalar_type), + 1, // device_type = cuda + 0, // device_index = 0 + &gpu_input_handle); + + ET_CHECK_OR_RETURN_ERROR( + create_err == Error::Ok, + Internal, + "Failed to create GPU tensor for input %d", + i); + + gpu_inputs[i] = gpu_input_handle; + + // Copy data from CPU to GPU + ET_CHECK_OR_RETURN_ERROR( + aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok, + Internal, + "Failed to copy input %d from CPU to GPU", + i); + } + // Process output tensors: create GPU counterparts for ExecuTorch CPU + // tensors + for (int i = 0; i < n_outputs; i++) { + // Get output tensor dimensions from ExecuTorch CPU tensor + auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); + auto sizes = cpu_output_tensor->sizes(); + auto scalar_type = cpu_output_tensor->scalar_type(); + + // Create GPU tensor with same shape for kernel output + std::vector sizes_vec(sizes.begin(), sizes.end()); + + AOTITensorHandle gpu_output_handle; + Error create_err = aoti_torch_empty_strided( + sizes_vec.size(), + sizes_vec.data(), + nullptr, // use default strides + static_cast(scalar_type), + 1, 
// device_type = cuda + 0, // device_index = 0 + &gpu_output_handle); + + ET_CHECK_OR_RETURN_ERROR( + create_err == Error::Ok, + Internal, + "Failed to create GPU tensor for output %d", + i); + + gpu_outputs[i] = gpu_output_handle; + } + // Run AOTI container with GPU tensors + AOTIRuntimeError error = AOTInductorModelContainerRun( + handle->container_handle, + gpu_inputs.data(), // Use GPU input tensors + n_inputs, + gpu_outputs.data(), // Use GPU output tensors + n_outputs, + handle->cuda_stream, // Pass the actual CUDA stream + nullptr); // proxy_executor_handle can remain nullptr + + ET_CHECK_OR_RETURN_ERROR( + error == Error::Ok, + Internal, + "AOTInductorModelContainerRun failed with error code %d", + error); + + // Copy GPU output results back to CPU output tensors + for (int i = 0; i < n_outputs; i++) { + auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); + // For DYNAMIC_BOUND tensors we try to resize + ET_CHECK_OK_OR_RETURN_ERROR( + resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()), + "Error resizing tensor at output index %d", + i); + ET_CHECK_OK_OR_RETURN_ERROR( + aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0), + "Failed to copy GPU output %d back to CPU", + i); + } + + return Error::Ok; + } + + void destroy(DelegateHandle* handle_) const override { + if (handle_ == nullptr) { + return; + } + AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; + + // Destroy the CUDA stream if it exists + if (handle->cuda_stream != nullptr) { + cudaStream_t cuda_stream = static_cast(handle->cuda_stream); + cudaError_t stream_err = cudaStreamDestroy(cuda_stream); + ET_CHECK_OR_LOG_ERROR( + stream_err == cudaSuccess, + "Failed to destroy CUDA stream: %s", + cudaGetErrorString(stream_err)); + handle->cuda_stream = nullptr; + } + + // NOTE: AOTInductorModelContainerDelete does not work correctly with + // multiple .so files. 
Deleting one container frees shared resources, + // which causes segmentation faults when attempting to delete other + // containers. As a workaround, we skip explicit container deletion + // and defer cleanup to the OS. + // TODO(gasoonjia): Find a proper solution for safe container deletion. + // AOTInductorModelContainerDelete(handle->container_handle); + + // Now close the shared library + if (handle->so_handle != nullptr) { + dlclose(handle->so_handle); + } + + // Remove the temporary shared library file + if (!handle->so_path.empty()) { + std::error_code remove_error; + std::filesystem::remove(handle->so_path, remove_error); + ET_CHECK_OR_LOG_ERROR( + !remove_error, + "Failed to remove temporary shared library %s: %s", + handle->so_path.c_str(), + remove_error.message().c_str()); + } + + delete handle; + clear_all_tensors(); + } +}; + +} // namespace executorch::backends::cuda + +namespace executorch::backends { +namespace { +auto cls = cuda::CudaBackend(); +executorch::runtime::Backend backend{"CudaBackend", &cls}; +static executorch::runtime::Error success_with_compiler = + register_backend(backend); +} // namespace +} // namespace executorch::backends diff --git a/backends/cuda/runtime/guard.cpp b/backends/cuda/runtime/guard.cpp new file mode 100644 index 00000000000..674cc6387b3 --- /dev/null +++ b/backends/cuda/runtime/guard.cpp @@ -0,0 +1,147 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +namespace executorch::backends::cuda { + +namespace { +// Thread-local stream storage (private to this file) +thread_local std::unordered_map current_streams_; +} // namespace + +Error setCurrentCUDAStream(cudaStream_t stream, DeviceIndex device_index) { + if (device_index == -1) { + // Get current device if not specified + int current_device; + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaGetDevice(¤t_device)); + device_index = current_device; + } + + current_streams_[device_index] = stream; + return Error::Ok; +} + +Result getCurrentCUDAStream(DeviceIndex device_index) { + if (device_index == -1) { + int current_device; + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaGetDevice(¤t_device)); + device_index = current_device; + } + + auto it = current_streams_.find(device_index); + if (it != current_streams_.end()) { + return it->second; + } + + cudaStream_t stream; + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamCreate(&stream)); + setCurrentCUDAStream(stream, device_index); + return stream; +} + +CUDAGuard::CUDAGuard(CUDAGuard&& other) noexcept + : original_device_index_(other.original_device_index_), + current_device_index_(other.current_device_index_) { + // Mark the moved-from object as "already restored" so its destructor doesn't + // try to restore the device + other.original_device_index_ = other.current_device_index_; +} + +CUDAGuard::~CUDAGuard() { + if (original_device_index_ != current_device_index_) { + cudaError_t err = cudaSetDevice(original_device_index_); + if (err != cudaSuccess) { + ET_LOG( + Error, + "~CUDAGuard: Failed to restore device to %d: %s", + original_device_index_, + cudaGetErrorString(err)); + } + } +} + +Error CUDAGuard::set_index(DeviceIndex device_index) { + int orig_index = -1; + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaGetDevice(&orig_index)); + + original_device_index_ = orig_index; + current_device_index_ = device_index; + + if (current_device_index_ != original_device_index_) { + 
ET_CUDA_CHECK_OR_RETURN_ERROR(cudaSetDevice(current_device_index_)); + } + + return Error::Ok; +} + +Result CUDAGuard::create(DeviceIndex device_index) { + CUDAGuard guard; // Fixed: Removed () to create a variable, not a function + ET_CHECK_OK_OR_RETURN_ERROR(guard.set_index(device_index)); + return guard; +} + +CUDAStreamGuard::CUDAStreamGuard(CUDAStreamGuard&& other) noexcept + : device_guard_(std::move(other.device_guard_)), + original_stream_(other.original_stream_), + current_stream_(other.current_stream_), + device_index_(other.device_index_) { + // Mark the moved-from object as "already restored" so its destructor doesn't + // try to restore the stream + other.original_stream_ = other.current_stream_; +} + +CUDAStreamGuard::~CUDAStreamGuard() { + // Restore the original stream unless this object was moved-from. + // After a move, original_stream_ == current_stream_, which indicates + // the moved-from object should not restore. + // Note: nullptr is a valid stream value (represents the default stream), + // so we must restore even if original_stream_ is nullptr. 
+ if (original_stream_ != current_stream_) { + Error err = setCurrentCUDAStream(original_stream_, device_index_); + if (err != Error::Ok) { + ET_LOG( + Error, + "~CUDAStreamGuard: Failed to restore stream for device %d", + device_index_); + } + } +} + +Error CUDAStreamGuard::set_stream( + cudaStream_t stream, + DeviceIndex device_index) { + auto result = getCurrentCUDAStream(device_index); + if (!result.ok()) { + ET_LOG(Error, "Failed to get current stream for device %d", device_index); + return result.error(); + } + + original_stream_ = result.get(); + current_stream_ = stream; + device_index_ = device_index; + + ET_CHECK_OK_OR_RETURN_ERROR(setCurrentCUDAStream(stream, device_index)); + + return Error::Ok; +} + +Result CUDAStreamGuard::create( + cudaStream_t stream, + DeviceIndex device_index) { + auto guard_result = CUDAGuard::create(device_index); + ET_CHECK_OK_OR_RETURN_ERROR(guard_result.error()); + + CUDAStreamGuard stream_guard(std::move(guard_result.get())); + ET_CHECK_OK_OR_RETURN_ERROR(stream_guard.set_stream(stream, device_index)); + + return stream_guard; +} + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/guard.h b/backends/cuda/runtime/guard.h new file mode 100644 index 00000000000..3f187000f90 --- /dev/null +++ b/backends/cuda/runtime/guard.h @@ -0,0 +1,191 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace executorch::backends::cuda { + +using executorch::runtime::Error; +using executorch::runtime::Result; + +// Type alias for device index +using DeviceIndex = int32_t; + +/** + * Set the current CUDA stream for the specified device. 
+ * + * @param stream The CUDA stream to set as current + * @param device_index The device index (-1 to use current device) + * @return Error code indicating success or failure + */ +Error setCurrentCUDAStream(cudaStream_t stream, DeviceIndex device_index = -1); + +/** + * Get the current CUDA stream for the specified device. + * If no stream has been set, creates a new stream and sets it as current. + * + * @param device_index The device index (-1 to use current device) + * @return Result containing the current stream on success, or an error code on + * failure + */ +Result getCurrentCUDAStream(DeviceIndex device_index = -1); + +/** + * RAII guard that sets the current CUDA device and restores it on destruction. + * This ensures that the device is properly restored even if an exception + * occurs. + * + */ +class CUDAGuard { + private: + /** + * Private constructor - use create() factory method instead. + */ + explicit CUDAGuard() + : original_device_index_(-1), current_device_index_(-1) {} + + public: + /** + * Factory method to create a CUDAGuard. + * + * @param device_index The device index to set as current + * @return Result containing the guard on success, or an error code on failure + */ + static Result create(DeviceIndex device_index); + + // Copy is not allowed + CUDAGuard(const CUDAGuard&) = delete; + CUDAGuard& operator=(const CUDAGuard&) = delete; + + // Move constructor and assignment + CUDAGuard(CUDAGuard&& other) noexcept; + CUDAGuard& operator=(CUDAGuard&& other) = delete; + + /** + * Destructor that restores the original device if necessary. + */ + ~CUDAGuard(); + + /** + * Sets the CUDA device to the given device index. + * + * @param device_index The device index to set as current + * @return Error code indicating success or failure + */ + Error set_index(DeviceIndex device_index); + + /** + * Get the original device index before the guard was created. 
+ * + * @return The original device index + */ + DeviceIndex original_device() const { + return original_device_index_; + } + + /** + * Get the current device index. + * + * @return The current device index + */ + DeviceIndex current_device() const { + return current_device_index_; + } + + private: + /// The original device before this guard was created + DeviceIndex original_device_index_; + /// The current device managed by this guard + DeviceIndex current_device_index_; +}; + +/** + * RAII guard that sets the current CUDA device and stream, restoring both on + * destruction. This is useful for temporarily switching to a different device + * and stream. + * + */ +class CUDAStreamGuard { + private: + // Private constructor that takes a CUDAGuard + explicit CUDAStreamGuard(CUDAGuard&& guard) + : device_guard_(std::move(guard)), + original_stream_(nullptr), + current_stream_(nullptr), + device_index_(-1) {} + + public: + /** + * Factory method to create a CUDAStreamGuard. + * + * @param stream The CUDA stream to set as current + * @param device_index The device index for the stream + * @return Result containing the guard on success, or an error code on failure + */ + static Result create( + cudaStream_t stream, + DeviceIndex device_index); + + // Copy is not allowed + CUDAStreamGuard(const CUDAStreamGuard&) = delete; + CUDAStreamGuard& operator=(const CUDAStreamGuard&) = delete; + + // Move constructor and assignment + CUDAStreamGuard(CUDAStreamGuard&& other) noexcept; + CUDAStreamGuard& operator=(CUDAStreamGuard&& other) noexcept = delete; + + /** + * Destructor that restores the original stream and device. + */ + ~CUDAStreamGuard(); + + /** + * Sets the CUDA stream to the given stream on the specified device. 
+ * + * @param stream The CUDA stream to set as current + * @param device_index The device index for the stream + * @return Error code indicating success or failure + */ + Error set_stream(cudaStream_t stream, DeviceIndex device_index); + + /** + * Get the current guarded stream. + * + * @return The current stream + */ + cudaStream_t stream() const { + return current_stream_; + } + + /** + * Get the device index being guarded. + * + * @return The device index + */ + DeviceIndex device_index() const { + return device_index_; + } + + private: + /// The device guard that handles device switching + CUDAGuard device_guard_; + /// The original stream that was current before this guard + cudaStream_t original_stream_ = nullptr; + /// The current stream being guarded + cudaStream_t current_stream_ = nullptr; + /// The device index for this stream guard + DeviceIndex device_index_; +}; + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/shims/cuda_guard.cpp b/backends/cuda/runtime/shims/cuda_guard.cpp new file mode 100644 index 00000000000..bb07acc7ffa --- /dev/null +++ b/backends/cuda/runtime/shims/cuda_guard.cpp @@ -0,0 +1,105 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +namespace executorch::backends::cuda { + +extern "C" { + +AOTITorchError aoti_torch_create_cuda_guard( + int32_t device_index, + CUDAGuardHandle* ret_guard) { + ET_CHECK_OR_RETURN_ERROR( + ret_guard != nullptr, + InvalidArgument, + "aoti_torch_create_cuda_guard failed: ret_guard is null"); + + auto result = CUDAGuard::create(device_index); + if (!result.ok()) { + return result.error(); + } + *ret_guard = new CUDAGuard(std::move(result.get())); + return Error::Ok; +} + +AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard) { + ET_CHECK_OR_RETURN_ERROR( + guard != nullptr, + InvalidArgument, + "aoti_torch_delete_cuda_guard failed: guard is null"); + + delete guard; + return Error::Ok; +} + +AOTITorchError aoti_torch_cuda_guard_set_index( + CUDAGuardHandle guard, + int32_t device_index) { + ET_CHECK_OR_RETURN_ERROR( + guard != nullptr, + InvalidArgument, + "aoti_torch_cuda_guard_set_index failed: guard is null"); + + ET_CHECK_OK_OR_RETURN_ERROR(guard->set_index(device_index)); + return Error::Ok; +} + +AOTITorchError aoti_torch_create_cuda_stream_guard( + void* stream, + int32_t device_index, + CUDAStreamGuardHandle* ret_guard) { + ET_CHECK_OR_RETURN_ERROR( + ret_guard != nullptr, + InvalidArgument, + "aoti_torch_create_cuda_stream_guard failed: ret_guard is null"); + + ET_CHECK_OR_RETURN_ERROR( + stream != nullptr, + InvalidArgument, + "aoti_torch_create_cuda_stream_guard failed: stream is null"); + + auto result = + CUDAStreamGuard::create(static_cast(stream), device_index); + if (!result.ok()) { + return result.error(); + } + *ret_guard = new CUDAStreamGuard(std::move(result.get())); + return Error::Ok; +} + +AOTITorchError aoti_torch_delete_cuda_stream_guard( + CUDAStreamGuardHandle guard) { + ET_CHECK_OR_RETURN_ERROR( + guard != nullptr, + InvalidArgument, + "aoti_torch_delete_cuda_stream_guard failed: guard is null"); + + delete guard; + return Error::Ok; +} + +AOTITorchError aoti_torch_get_current_cuda_stream( + int32_t 
device_index, + void** ret_stream) { + ET_CHECK_OR_RETURN_ERROR( + ret_stream != nullptr, + InvalidArgument, + "aoti_torch_get_current_cuda_stream failed: ret_stream is null"); + + auto result = getCurrentCUDAStream(device_index); + if (!result.ok()) { + return result.error(); + } + *ret_stream = static_cast(result.get()); + return Error::Ok; +} + +} // extern "C" + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/shims/cuda_guard.h b/backends/cuda/runtime/shims/cuda_guard.h new file mode 100644 index 00000000000..f930f3df643 --- /dev/null +++ b/backends/cuda/runtime/shims/cuda_guard.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include + +namespace executorch::backends::cuda { + +using executorch::backends::aoti::AOTITorchError; + +extern "C" { + +// Handle types for CUDA guards +using CUDAGuardHandle = CUDAGuard*; +using CUDAStreamGuardHandle = CUDAStreamGuard*; + +/** + * Creates a CUDA device guard that sets the current device and restores it + * upon destruction. + * + * @param device_index The device index to set as current + * @param ret_guard Output parameter for the created guard handle (must not be + * null) + * @return AOTITorchError error code (Error::Ok on success, or an error code on + * failure) + */ +AOTITorchError aoti_torch_create_cuda_guard( + int32_t device_index, + CUDAGuardHandle* ret_guard); + +/** + * Deletes a CUDA device guard and frees its associated resources. + * + * @param guard Handle to the guard to be deleted + * @return AOTITorchError error code (Error::Ok on success, or an error code on + * failure) + */ +AOTITorchError aoti_torch_delete_cuda_guard(CUDAGuardHandle guard); + +/** + * Sets the CUDA device to a new index for an existing guard. 
+ * + * @param guard Handle to the guard + * @param device_index The device index to set as current + * @return AOTITorchError error code (Error::Ok on success, or an error code on + * failure) + */ +AOTITorchError aoti_torch_cuda_guard_set_index( + CUDAGuardHandle guard, + int32_t device_index); + +/** + * Creates a CUDA stream guard that sets the current device and stream, + * restoring both upon destruction. + * + * @param stream The CUDA stream to set as current + * @param device_index The device index for the stream + * @param ret_guard Output parameter for the created guard handle (must not be + * null) + * @return AOTITorchError error code (Error::Ok on success, or an error code on + * failure) + */ +AOTITorchError aoti_torch_create_cuda_stream_guard( + void* stream, + int32_t device_index, + CUDAStreamGuardHandle* ret_guard); + +/** + * Deletes a CUDA stream guard and frees its associated resources. + * + * @param guard Handle to the stream guard to be deleted + * @return AOTITorchError error code (Error::Ok on success, or an error code on + * failure) + */ +AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); + +/** + * Gets the current CUDA stream for a specified device. + * + * @param device_index The device index (-1 to use current device) + * @param ret_stream Output parameter for the current stream (must not be null) + * @return AOTITorchError error code (Error::Ok on success, or an error code on + * failure) + */ +AOTITorchError aoti_torch_get_current_cuda_stream( + int32_t device_index, + void** ret_stream); + +} // extern "C" + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/shims/memory.cpp b/backends/cuda/runtime/shims/memory.cpp new file mode 100644 index 00000000000..6fe315ba8ee --- /dev/null +++ b/backends/cuda/runtime/shims/memory.cpp @@ -0,0 +1,663 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include +#include // For posix_memalign +#include +#include +#include +#include + +namespace executorch::backends::cuda { + +using executorch::aten::SizesType; +using executorch::aten::StridesType; +using executorch::backends::aoti::aoti_torch_get_device_index; +using executorch::backends::aoti::aoti_torch_get_dtype; +using executorch::backends::aoti::aoti_torch_get_sizes; +using executorch::backends::aoti::aoti_torch_get_strides; +using executorch::backends::aoti::dtype_to_element_size; +using executorch::backends::aoti::dtype_to_scalar_type; +using executorch::backends::aoti::validate_storage_offset; + +// Global storage for tensors and their metadata +std::unordered_set> tensors; + +// Reference counting for memory addresses +// Maps memory address to number of tensors using it +// Special value: NOT_OWN (-1) means tensor never owns the memory +constexpr int32_t NOT_OWN = -1; +std::unordered_map memory_to_n_tensor; + +namespace { + +// Calculate linear offset from strides and indices +int64_t calculate_linear_offset( + const int64_t* indices, + const int64_t* strides, + int64_t ndim) { + int64_t offset = 0; + for (int64_t i = 0; i < ndim; ++i) { + offset += indices[i] * strides[i]; + } + return offset; +} + +// Convert linear index to multi-dimensional indices based on sizes +void linear_to_indices( + int64_t linear_idx, + const int64_t* sizes, + int64_t ndim, + int64_t* indices) { + for (int64_t i = ndim - 1; i >= 0; --i) { + indices[i] = linear_idx % sizes[i]; + linear_idx /= sizes[i]; + } +} + +// Generic pointwise copy function that handles arbitrary strides +template +AOTITorchError pointwise_copy_generic( + T* dst_data, + const T* src_data, + const int64_t* dst_sizes, + const int64_t* dst_strides, + const int64_t* src_sizes, + const int64_t* 
src_strides, + int64_t dst_ndim, + int64_t src_ndim, + int64_t total_elements) { + std::vector dst_indices(dst_ndim); + std::vector src_indices(src_ndim); + + for (int64_t linear_idx = 0; linear_idx < total_elements; ++linear_idx) { + // Convert linear index to multi-dimensional indices for both tensors + linear_to_indices(linear_idx, dst_sizes, dst_ndim, dst_indices.data()); + linear_to_indices(linear_idx, src_sizes, src_ndim, src_indices.data()); + + // Calculate offsets for both source and destination + int64_t src_offset = + calculate_linear_offset(src_indices.data(), src_strides, src_ndim); + int64_t dst_offset = + calculate_linear_offset(dst_indices.data(), dst_strides, dst_ndim); + + // Copy element + dst_data[dst_offset] = src_data[src_offset]; + } + + return Error::Ok; +} + +} // anonymous namespace + +extern "C" { + +AOTITorchError aoti_torch_create_tensor_from_blob_v2( + void* data, + int64_t ndim, + const int64_t* sizes_ptr, + const int64_t* strides_ptr, + int64_t storage_offset, + int32_t dtype, + int32_t device_type, + int32_t device_index, + Tensor** ret_new_tensor, + int32_t layout, + const uint8_t* opaque_metadata, + int64_t opaque_metadata_size) { + // TODO(gasoonjia): verify given data is on the target device + (void)device_type; + (void)opaque_metadata; + (void)layout; + (void)opaque_metadata_size; + + // Validate input parameters first + ET_CHECK_OR_RETURN_ERROR( + data != nullptr, + InvalidArgument, + "aoti_torch_create_tensor_from_blob_v2 failed: data pointer is null"); + + ET_CHECK_OR_RETURN_ERROR( + !(sizes_ptr == nullptr && ndim > 0), + InvalidArgument, + "aoti_torch_create_tensor_from_blob_v2 failed: sizes_ptr is null"); + + ET_CHECK_OR_RETURN_ERROR( + ret_new_tensor != nullptr, + InvalidArgument, + "aoti_torch_create_tensor_from_blob_v2 failed: ret_new_tensor is null"); + + // Check that device_index is always 0 + ET_CHECK_OR_RETURN_ERROR( + device_index == 0, + InvalidArgument, + "device_index must be 0, got: %d", + device_index); + + 
// Validate dtype using SupportedDTypes from utils.h + ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(dtype)); + + // Storage offset must be 0 since from_blob cannot handle different offsets + ET_CHECK_OK_OR_RETURN_ERROR(validate_storage_offset(storage_offset)); + + // Convert sizes to the format expected by ExecutorTorch using SizesType + std::vector sizes = + convert_sizes_to_vector(ndim, sizes_ptr); + + // Convert strides using the common helper function with StridesType + std::vector strides = + convert_strides_to_vector(ndim, sizes_ptr, strides_ptr); + + // Create ExecutorTorch tensor that wraps the existing memory + // Note: We're NOT copying the data, just wrapping it + auto tensor = executorch::extension::from_blob( + data, // existing memory (don't copy!) + sizes, // tensor dimensions + strides, // tensor strides (allows different strides) + dtype_to_scalar_type(dtype) // map int32_t dtype to ScalarType + ); + + ET_CHECK_OR_RETURN_ERROR( + tensor != nullptr, InvalidArgument, "Failed to create tensor from blob"); + + // Store the tensor so it doesn't get destroyed + tensors.insert(tensor); + + *ret_new_tensor = tensor.get(); + + // Check if this memory address is already being tracked + auto memory_it = memory_to_n_tensor.find(data); + ET_CHECK_OR_RETURN_ERROR( + memory_it == memory_to_n_tensor.end(), + InvalidArgument, + "Memory address %p is already being tracked by another tensor", + data); + + // Mark this memory as NOT_OWN since tensor created from blob never owns + // memory + memory_to_n_tensor[data] = NOT_OWN; + + return Error::Ok; +} + +AOTITorchError aoti_torch_empty_strided( + int64_t ndim, + const int64_t* sizes_ptr, + const int64_t* strides_ptr, + int32_t dtype, + int32_t device_type, + int32_t device_index, + Tensor** ret_new_tensor) { + // Check that device_index is always 0 + ET_CHECK_OR_RETURN_ERROR( + device_index == 0, + InvalidArgument, + "device_index must be 0, got: %d", + device_index); + + // This requires us to reserve CUDA memory and 
put it into a ETensor + void* ptr; + int64_t numel = 1; + for (int64_t i = 0; i < ndim; i++) { + numel *= sizes_ptr[i]; + } + + ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(dtype)); + + size_t element_size = dtype_to_element_size(dtype); + ET_CHECK_OR_RETURN_ERROR( + element_size != 0, + InvalidArgument, + "Invalid element size for dtype: %d", + dtype); + int64_t nbytes = numel * element_size; + + if (device_type == static_cast(SupportedDevices::CUDA)) { + ET_CUDA_CHECK_OR_RETURN_ERROR( + cudaMallocAsync(&ptr, static_cast(nbytes), cudaStreamDefault)); + } else if (device_type == static_cast(SupportedDevices::CPU)) { + // Ensure 16-byte alignment for CPU memory to match CUDA requirements + int result = posix_memalign(&ptr, 16, nbytes); + ET_CHECK_OR_RETURN_ERROR( + result == 0, + MemoryAllocationFailed, + "Failed to allocate aligned CPU memory"); + ET_CHECK_OR_RETURN_ERROR( + ptr != nullptr, + MemoryAllocationFailed, + "Failed to call posix_memalign"); + } else { + ET_CHECK_OR_RETURN_ERROR( + false, + NotImplemented, + "Need to implement empty_strided for non-CUDA non-CPU device type %d", + device_type); + } + + // ETensor sizes + auto sizes = convert_sizes_to_vector(ndim, sizes_ptr); + + // ETensor strides + auto strides = convert_strides_to_vector(ndim, sizes_ptr, strides_ptr); + + // ETensor creation with dynamic shape support for edge cases + auto tensor = executorch::extension::from_blob( + ptr, sizes, strides, dtype_to_scalar_type(dtype)); + + // Store the tensor so it doesn't get destroyed + tensors.insert(tensor); + *ret_new_tensor = tensor.get(); + + // This tensor owns the memory it allocated, set reference count to 1 + memory_to_n_tensor[ptr] = 1; + + return Error::Ok; +} + +void clear_all_tensors() { + // Use aoti_torch_delete_tensor_object to properly delete each tensor + // Note: We need to collect tensor pointers first since deletion modifies the + // set + std::vector tensor_ptrs; + tensor_ptrs.reserve(tensors.size()); + for (const auto& tensor_shared 
: tensors) { + tensor_ptrs.push_back(tensor_shared.get()); + } + + // Now delete each tensor - this will modify the global tensors set + for (Tensor* tensor_ptr : tensor_ptrs) { + aoti_torch_delete_tensor_object(tensor_ptr); + } + + // tensors set should now be empty, but ensure it's cleared + tensors.clear(); + + ET_LOG(Info, "Cleared all tensors"); +} + +AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor) { + // Handle null tensor pointer + ET_CHECK_OR_RETURN_ERROR( + tensor != nullptr, InvalidArgument, "Cannot delete null tensor"); + + // Check if tensor exists in our tracking + bool found_in_tensors = false; + for (auto it = tensors.begin(); it != tensors.end(); ++it) { + if (it->get() == tensor) { + found_in_tensors = true; + break; + } + } + + // If tensor not found in our tracking, it's invalid + ET_CHECK_OR_RETURN_ERROR( + found_in_tensors, InvalidArgument, "Didn't find tensor %p", tensor); + + // Find and delete the tensor + for (auto it = tensors.begin(); it != tensors.end(); ++it) { + if (it->get() == tensor) { + // Get the tensor before erasing + auto tensor_ptr = *it; + void* data_ptr = tensor_ptr->mutable_data_ptr(); + + // Find the reference count for this memory address + auto memory_it = memory_to_n_tensor.find(data_ptr); + if (memory_it != memory_to_n_tensor.end()) { + int32_t ref_count = memory_it->second; + + if (ref_count == NOT_OWN) { + // Tensor never owned the memory, skip freeing + // Just remove tensor from tracking + tensors.erase(it); + return Error::Ok; + } else if (ref_count == 1) { + // Only current tensor using this memory, free it + // Determine if it's GPU memory + cudaPointerAttributes attributes{}; + ET_CUDA_CHECK_OR_RETURN_ERROR( + cudaPointerGetAttributes(&attributes, data_ptr)); + + if (attributes.type == cudaMemoryTypeDevice) { + ET_CUDA_CHECK_OR_RETURN_ERROR( + cudaFreeAsync(data_ptr, cudaStreamDefault)); + } else { + ET_CHECK_OR_RETURN_ERROR( + attributes.type != cudaMemoryTypeManaged, + Internal, + "Expected 
host memory but got managed!") + // This is CPU memory - free immediately + free(data_ptr); + data_ptr = nullptr; + } + + // Remove from memory tracking + memory_to_n_tensor.erase(memory_it); + } else if (ref_count > 1) { + // Other tensors still using this memory, just decrement count + memory_to_n_tensor[data_ptr] = ref_count - 1; + } + } else { + ET_CHECK_OR_RETURN_ERROR( + false, + Internal, + "Internal error: memory not found during deletion"); + } + + // Remove tensor from set (this will call the destructor if it's the last + // reference) + tensors.erase(it); + return Error::Ok; + } + } + + // This should never be reached since we found it above + ET_CHECK_OR_RETURN_ERROR( + false, Internal, "Internal error: tensor not found after validation"); +} + +AOTITorchError +aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking) { + (void)non_blocking; + + // Check for null pointers first + ET_CHECK_OR_RETURN_ERROR( + self != nullptr, + InvalidArgument, + "aoti_torch_copy_ failed: self tensor is null"); + + ET_CHECK_OR_RETURN_ERROR( + src != nullptr, + InvalidArgument, + "aoti_torch_copy_ failed: src tensor is null"); + + // Get dtype information and validate compatibility + int32_t self_dtype, src_dtype; + aoti_torch_get_dtype(self, &self_dtype); + aoti_torch_get_dtype(src, &src_dtype); + + ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(self_dtype)); + + ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(src_dtype)); + + // Check dtype compatibility - both tensors must have the same dtype + ET_CHECK_OR_RETURN_ERROR( + self_dtype == src_dtype, + InvalidArgument, + "dtype mismatch. self.dtype=%d, src.dtype=%d. aoti_torch_copy_ requires same dtypes", + self_dtype, + src_dtype); + + // Check total number of elements compatibility (PyTorch copy_ behavior) + int64_t self_numel = self->numel(); + int64_t src_numel = src->numel(); + + ET_CHECK_OR_RETURN_ERROR( + self_numel == src_numel, + InvalidArgument, + "numel mismatch. 
self.numel()=%ld, src.numel()=%ld", + self_numel, + src_numel); + + // Get tensor metadata + int64_t* self_strides; + int64_t* src_strides; + aoti_torch_get_strides(self, &self_strides); + aoti_torch_get_strides(src, &src_strides); + + int64_t* self_sizes; + int64_t* src_sizes; + aoti_torch_get_sizes(self, &self_sizes); + aoti_torch_get_sizes(src, &src_sizes); + + // Determine device locations + cudaPointerAttributes srcAttributes{}; + cudaPointerAttributes dstAttributes{}; + + ET_CUDA_CHECK_OR_RETURN_ERROR( + cudaPointerGetAttributes(&srcAttributes, src->data_ptr())); + + ET_CUDA_CHECK_OR_RETURN_ERROR( + cudaPointerGetAttributes(&dstAttributes, self->data_ptr())); + + bool srcIsDevice = srcAttributes.type == cudaMemoryTypeDevice; + bool dstIsDevice = dstAttributes.type == cudaMemoryTypeDevice; + + // Check if tensors have the same schema (sizes, strides, dtype) for fast path + bool same_schema = true; + for (int i = 0; i < self->dim(); i++) { + if (self_strides[i] != src_strides[i]) { + same_schema = false; + break; + } + } + + size_t total_bytes = src->nbytes(); + int64_t total_elements = self->numel(); + + if (same_schema) { + // Fast path: Direct memory copy since layouts match exactly + if (srcIsDevice && dstIsDevice) { + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaMemcpy( + self->mutable_data_ptr(), + src->data_ptr(), + total_bytes, + cudaMemcpyDeviceToDevice)); + } else if (srcIsDevice && !dstIsDevice) { + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaMemcpy( + self->mutable_data_ptr(), + src->data_ptr(), + total_bytes, + cudaMemcpyDeviceToHost)); + } else if (!srcIsDevice && dstIsDevice) { + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaMemcpy( + self->mutable_data_ptr(), + src->data_ptr(), + total_bytes, + cudaMemcpyHostToDevice)); + } else { + std::memcpy(self->mutable_data_ptr(), src->data_ptr(), total_bytes); + } + } else { + // Fallback path: Pointwise copy with stride-aware indexing + // This handles arbitrary tensor layouts and strides + + size_t element_size = 
dtype_to_element_size(self_dtype); + ET_CHECK_OR_RETURN_ERROR( + element_size != 0, + InvalidArgument, + "Invalid element size for dtype: %d", + self_dtype); + + // Allocate temporary host memory for GPU tensors + float* src_host_data = nullptr; + float* dst_host_data = nullptr; + bool need_free_src = false; + bool need_free_dst = false; + + if (srcIsDevice) { + src_host_data = + static_cast(malloc(total_elements * sizeof(float))); + ET_CHECK_OR_RETURN_ERROR( + src_host_data != nullptr, + MemoryAllocationFailed, + "Failed to allocate memory for src_host_data"); + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaMemcpy( + src_host_data, src->data_ptr(), total_bytes, cudaMemcpyDeviceToHost)); + need_free_src = true; + } else { + src_host_data = static_cast(src->data_ptr()); + } + + if (dstIsDevice) { + dst_host_data = + static_cast(malloc(total_elements * sizeof(float))); + if (dst_host_data == nullptr) { + if (need_free_src) { + free(src_host_data); + } + ET_CHECK_OR_RETURN_ERROR( + false, + MemoryAllocationFailed, + "Failed to allocate memory for dst_host_data"); + } + need_free_dst = true; + } else { + dst_host_data = static_cast(self->mutable_data_ptr()); + } + + // Perform pointwise copy with stride calculation + AOTITorchError copy_err = pointwise_copy_generic( + dst_host_data, + src_host_data, + self_sizes, + self_strides, + src_sizes, + src_strides, + self->dim(), + src->dim(), + total_elements); + + if (copy_err != Error::Ok) { + // Clean up temporary buffers before returning + if (need_free_src) { + free(src_host_data); + } + if (need_free_dst) { + free(dst_host_data); + } + return copy_err; + } + + // Copy result back to device if needed + if (dstIsDevice) { + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaMemcpy( + self->mutable_data_ptr(), + dst_host_data, + total_bytes, + cudaMemcpyHostToDevice)); + } + + // Clean up temporary buffers + if (need_free_src) { + free(src_host_data); + } + if (need_free_dst) { + free(dst_host_data); + } + } + + return Error::Ok; +} + +AOTITorchError 
aoti_torch__reinterpret_tensor( + Tensor* self, + int64_t ndim, + const int64_t* sizes_ptr, + const int64_t* strides_ptr, + int64_t storage_offset, + Tensor** ret_new_tensor) { + // Validate input parameters first + ET_CHECK_OR_RETURN_ERROR( + self != nullptr, + InvalidArgument, + "aoti_torch__reinterpret_tensor failed: self tensor is null"); + + ET_CHECK_OR_RETURN_ERROR( + !(sizes_ptr == nullptr && ndim > 0), + InvalidArgument, + "aoti_torch__reinterpret_tensor failed: sizes_ptr is null"); + + ET_CHECK_OR_RETURN_ERROR( + ret_new_tensor != nullptr, + InvalidArgument, + "aoti_torch__reinterpret_tensor failed: ret_new_tensor is null"); + + // Check if storage_offset is not 0 - return error if not + ET_CHECK_OK_OR_RETURN_ERROR(validate_storage_offset(storage_offset)); + + // Get the device info from the source tensor to perform device_index + // validation + int32_t device_type = 0; + int32_t device_index = 0; + ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_device_type(self, &device_type)); + + ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_device_index(self, &device_index)); + + // Ensure device_index is always 0 + ET_CHECK_OR_RETURN_ERROR( + device_index == 0, + InvalidArgument, + "device_index must be 0, got: %d", + device_index); + + // Get the dtype from the source tensor + int32_t dtype = 0; + ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_dtype(self, &dtype)); + + // Validate dtype using SupportedDTypes + ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(dtype)); + + // Get the original data pointer from the source tensor + void* data_ptr = self->mutable_data_ptr(); + ET_CHECK_OR_RETURN_ERROR( + data_ptr != nullptr, + InvalidArgument, + "Source tensor has null data pointer"); + + // Check if the given memory is in the map, if not return error + auto memory_it = memory_to_n_tensor.find(data_ptr); + ET_CHECK_OR_RETURN_ERROR( + memory_it != memory_to_n_tensor.end(), + InvalidArgument, + "Memory address %p is not being tracked by reference counting system", + data_ptr); + + // 
Convert sizes using utility function from utils.h + std::vector sizes = convert_sizes_to_vector(ndim, sizes_ptr); + + // Convert strides using utility function from utils.h + std::vector strides = + convert_strides_to_vector(ndim, sizes_ptr, strides_ptr); + + // Create new tensor view that reinterprets the same memory with different + // shape/strides This creates a view, not a copy - the data pointer is shared + std::shared_ptr tensor = executorch::extension::from_blob( + data_ptr, // Reuse the same memory from source tensor + sizes, // New sizes with explicit SizesType + strides, // New strides with explicit StridesType + dtype_to_scalar_type(dtype) // Convert dtype with explicit type casting + ); + + ET_CHECK_OR_RETURN_ERROR( + tensor != nullptr, + InvalidArgument, + "Failed to create reinterpreted tensor view"); + + // Store the tensor so it doesn't get destroyed + tensors.insert(tensor); + + *ret_new_tensor = tensor.get(); + + // Increment the reference count for this memory address only if it is owned + // by tensor + memory_to_n_tensor[data_ptr] = memory_to_n_tensor[data_ptr] == NOT_OWN + ? NOT_OWN + : memory_to_n_tensor[data_ptr] + 1; + + return Error::Ok; +} + +} // extern "C" + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/shims/memory.h b/backends/cuda/runtime/shims/memory.h new file mode 100644 index 00000000000..7a8d4c3609b --- /dev/null +++ b/backends/cuda/runtime/shims/memory.h @@ -0,0 +1,146 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +namespace executorch::backends::cuda { + +using executorch::backends::aoti::AOTITorchError; +using executorch::backends::aoti::Tensor; + +extern "C" { + +/** + * Creates a tensor object from an existing memory blob without copying the + * data. 
The tensor will wrap the provided memory and will not take ownership of + * it. When the tensor is deleted, the original memory will remain valid and + * must be freed by the caller. + * + * @param data Pointer to the memory blob to wrap (must not be null) + * @param ndim Number of dimensions in the tensor + * @param sizes_ptr Pointer to array of dimension sizes (using SizesType) + * @param strides_ptr Pointer to array of strides for each dimension (using + * StridesType, can be null for contiguous) + * @param storage_offset Storage offset (must be 0 for current implementation) + * @param dtype Data type identifier (supports FLOAT32 and BFLOAT16 from + * SupportedDTypes) + * @param device_type Device type (CPU=0, CUDA=1 from SupportedDevices) + * @param device_index Device index (must be 0 for current implementation) + * @param ret_new_tensor Output parameter for the created tensor (must not be + * null) + * @param layout Tensor layout identifier (0=strided) + * @param opaque_metadata Optional metadata pointer (can be null) + * @param opaque_metadata_size Size of opaque metadata in bytes + * @return AOTITorchError error code (Error::Ok on success, or an error code on + * failure) + */ +AOTITorchError aoti_torch_create_tensor_from_blob_v2( + void* data, + int64_t ndim, + const int64_t* sizes_ptr, + const int64_t* strides_ptr, + int64_t storage_offset, + int32_t dtype, + int32_t device_type, + int32_t device_index, + Tensor** ret_new_tensor, + int32_t layout, + const uint8_t* opaque_metadata, + int64_t opaque_metadata_size); + +/** + * Creates an uninitialized tensor with specified dimensions, strides, and + * dtyper on either CPU or CUDA device. 
+ * + * @param ndim Number of dimensions in the tensor + * @param sizes_ptr Pointer to array of dimension sizes + * @param strides_ptr Pointer to array of strides for each dimension + * @param dtype Data type identifier (matches PyTorch scalar types) + * @param device_type Device type (0=CPU, 1=CUDA) + * @param device_index Device index (must be 0 for current implementation) + * @param ret_new_tensor Output parameter for the created tensor + * @return AOTITorchError error code (Error::Ok on success, or an error code on + * failure) + */ +AOTITorchError aoti_torch_empty_strided( + int64_t ndim, + const int64_t* sizes_ptr, + const int64_t* strides_ptr, + int32_t dtype, + int32_t device_type, + int32_t device_index, + Tensor** ret_new_tensor); + +/** + * Deletes a tensor object and frees its associated memory. + * + * @param tensor Pointer to the tensor object to be deleted + * @return AOTITorchError error code (Error::Ok on success, or an error code on + * failure) + */ +AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor); + +/** + * Creates a tensor view that reinterprets the same underlying memory with + * different shape and strides without copying data. + * + * Note that the new tensor will not have the ownership of the underlying + * memory. 
+ * + * @param self Input tensor whose memory will be reinterpreted + * @param ndim Number of dimensions for the new tensor view + * @param sizes_ptr Array of sizes for each dimension + * @param strides_ptr Array of strides for each dimension (or nullptr for + * contiguous) + * @param storage_offset Storage offset (must be 0) + * @param ret_new_tensor Output pointer to store the new tensor view + * + * @return Error::Ok on success, appropriate error code on failure + */ +AOTITorchError aoti_torch__reinterpret_tensor( + Tensor* self, + int64_t ndim, + const int64_t* sizes_ptr, + const int64_t* strides_ptr, + int64_t storage_offset, + Tensor** ret_new_tensor); + +/** + * Copies data from source tensor to destination tensor. + * + * This function implements copy function for tensors living in CUDA AOTI + * backend. It supports copying between tensors with different shapes (as long + * as they have the same total number of elements) and different memory + * layouts/strides. + * + * Note that currently this function does not support copying between tensors + * with different dtypes. 
+ * + * @param self Destination tensor (data will be overwritten) + * @param src Source tensor (data will be copied from this tensor) + * @param non_blocking Whether the copy should be non-blocking (currently + * ignored) + * + * @return Error::Ok on success, appropriate error code on failure: + * - Error::InvalidArgument: null pointers, dtype mismatch, numel + * mismatch + * - Error::MemoryAllocationFailed: failed to allocate temporary memory + * - Error::Internal: CUDA operation failures + */ +AOTITorchError +aoti_torch_copy_(Tensor* self, Tensor* src, int32_t non_blocking); + +// Function to clear all tensors from internal storage +void clear_all_tensors(); +} // extern "C" + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/shims/tensor_attribute.cpp b/backends/cuda/runtime/shims/tensor_attribute.cpp new file mode 100644 index 00000000000..1a14c79f9f2 --- /dev/null +++ b/backends/cuda/runtime/shims/tensor_attribute.cpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +namespace executorch::backends::cuda { + +extern "C" { + +// Device type functions for tensor attributes +AOTITorchError aoti_torch_get_device_type( + Tensor* tensor, + int32_t* ret_device_type) { + // All tensors in aoti-cuda delegate are on CUDA + *ret_device_type = aoti_torch_device_type_cuda(); + return Error::Ok; +} + +// Device type constants +int32_t aoti_torch_device_type_cuda() { + // Let's say cuda is 1 for ET as well + return 1; +} + +} // extern "C" + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/shims/tensor_attribute.h b/backends/cuda/runtime/shims/tensor_attribute.h new file mode 100644 index 00000000000..15a4e397d24 --- /dev/null +++ b/backends/cuda/runtime/shims/tensor_attribute.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +namespace executorch::backends::cuda { + +// Common using declarations for ExecutorTorch types +using executorch::runtime::Error; +using executorch::runtime::etensor::Tensor; + +extern "C" { + +// Common AOTI type aliases +using AOTITorchError = Error; + +// Device type functions for tensor attributes +AOTITorchError aoti_torch_get_device_type( + Tensor* tensor, + int32_t* ret_device_type); + +// Device type constants +int32_t aoti_torch_device_type_cuda(); + +} // extern "C" + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/shims/tests/TARGETS b/backends/cuda/runtime/shims/tests/TARGETS new file mode 100644 index 00000000000..9ff3e83a8bd --- /dev/null +++ b/backends/cuda/runtime/shims/tests/TARGETS @@ -0,0 +1,6 @@ +load("@fbcode_macros//build_defs:cpp_unittest.bzl", "cpp_unittest") +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git 
a/backends/cuda/runtime/shims/tests/targets.bzl b/backends/cuda/runtime/shims/tests/targets.bzl new file mode 100644 index 00000000000..70f27b86bec --- /dev/null +++ b/backends/cuda/runtime/shims/tests/targets.bzl @@ -0,0 +1,35 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbcode_macros//build_defs:cpp_unittest.bzl", "cpp_unittest") +load("@fbcode_macros//build_defs/lib:re_test_utils.bzl", "re_test_utils") + +def cuda_shim_cpp_unittest(name): + cpp_unittest( + name = "test_" + name, + srcs = [ + "test_" + name + ".cpp", + ], + deps = [ + "//executorch/backends/aoti:common_shims", + "//executorch/backends/cuda/runtime:runtime_shims", + "//executorch/extension/tensor:tensor", + "//executorch/runtime/core:core", + "//executorch/runtime/platform:platform", + "//executorch/runtime/core/exec_aten:lib", + ], + external_deps = [ + ("cuda", None, "cuda-lazy"), + ], + ) + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + cuda_shim_cpp_unittest("aoti_torch_empty_strided") + cuda_shim_cpp_unittest("aoti_torch_delete_tensor_object") + cuda_shim_cpp_unittest("aoti_torch_create_tensor_from_blob_v2") + cuda_shim_cpp_unittest("aoti_torch__reinterpret_tensor") + cuda_shim_cpp_unittest("aoti_torch_copy_") + cuda_shim_cpp_unittest("aoti_torch_cuda_guard") diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp new file mode 100644 index 00000000000..e18bf142b5c --- /dev/null +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch__reinterpret_tensor.cpp @@ -0,0 +1,810 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace executorch::backends::aoti; +using namespace executorch::backends::cuda; +using namespace executorch::runtime; +using executorch::runtime::etensor::Tensor; + +// Test fixture for aoti_torch__reinterpret_tensor tests +class AOTITorchReinterpretTensorTest : public ::testing::Test { + protected: + void SetUp() override { + // Initialize ExecuTorch Platform Abstraction Layer + et_pal_init(); + + // Check if CUDA is available + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + if (err != cudaSuccess || device_count == 0) { + GTEST_SKIP() << "CUDA not available, skipping CUDA tests"; + } + + // Clean up any existing cached metadata before each test + cleanup_tensor_metadata(); + + // Clear any remaining tensors from previous tests + clear_all_tensors(); + } + + void TearDown() override { + // Clean up metadata + cleanup_tensor_metadata(); + + // Clear the global tensor storage using the provided function + clear_all_tensors(); + } + + // Helper to calculate number of elements from sizes + int64_t calculate_numel(const std::vector& sizes) { + int64_t numel = 1; + for (int64_t size : sizes) { + numel *= size; + } + return numel; + } + + // Helper to calculate contiguous strides from sizes + std::vector calculate_contiguous_strides( + const std::vector& sizes) { + std::vector strides(sizes.size()); + if (sizes.empty()) { + return strides; + } + + strides[sizes.size() - 1] = 1; + for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { + strides[i] = strides[i + 1] * sizes[i + 1]; + } + return strides; + } + + // Helper to create a source tensor using empty_strided (which allocates new + // memory) + Tensor* create_source_tensor( + const std::vector& sizes, + int32_t dtype = 6, // float32 + 
int32_t device_type = 1, // CUDA + int32_t device_index = 0) { + std::vector strides = calculate_contiguous_strides(sizes); + + Tensor* tensor; + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + strides.data(), + dtype, + device_type, + device_index, + &tensor); + + if (error != Error::Ok) { + return nullptr; + } + + return tensor; + } + + private: + std::vector cuda_memory_buffers_; + std::vector cpu_memory_buffers_; +}; + +// Test basic functionality: reinterpret tensor with different shapes +TEST_F(AOTITorchReinterpretTensorTest, BasicReinterpretation) { + // Create a source tensor with shape [12] (1D with 12 elements) + std::vector source_sizes = {12}; + Tensor* source_tensor = create_source_tensor(source_sizes); + ASSERT_NE(source_tensor, nullptr); + + // Store the original data pointer + void* original_data_ptr = source_tensor->mutable_data_ptr(); + ASSERT_NE(original_data_ptr, nullptr); + + // Reinterpret as [3, 4] (2D with same number of elements) + std::vector new_sizes = {3, 4}; + std::vector new_strides = calculate_contiguous_strides(new_sizes); + + Tensor* reinterpreted_tensor; + AOTITorchError error = aoti_torch__reinterpret_tensor( + source_tensor, + new_sizes.size(), + new_sizes.data(), + new_strides.data(), + 0, // storage_offset + &reinterpreted_tensor); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(reinterpreted_tensor, nullptr); + + // Check that the reinterpreted tensor has the new shape + EXPECT_EQ(reinterpreted_tensor->dim(), 2); + EXPECT_EQ(reinterpreted_tensor->size(0), 3); + EXPECT_EQ(reinterpreted_tensor->size(1), 4); + + // CRITICAL: Check that the reinterpreted tensor uses the SAME memory + void* reinterpreted_data_ptr = reinterpreted_tensor->mutable_data_ptr(); + EXPECT_EQ(reinterpreted_data_ptr, original_data_ptr) + << "Reinterpreted tensor should use the same memory as the source tensor"; + + // Write data through the original tensor and verify it's visible through the + // reinterpreted tensor + 
std::vector test_data = { + 1.0f, + 2.0f, + 3.0f, + 4.0f, + 5.0f, + 6.0f, + 7.0f, + 8.0f, + 9.0f, + 10.0f, + 11.0f, + 12.0f}; + cudaError_t cuda_err = cudaMemcpy( + original_data_ptr, + test_data.data(), + test_data.size() * sizeof(float), + cudaMemcpyHostToDevice); + EXPECT_EQ(cuda_err, cudaSuccess); + + // Read back through the reinterpreted tensor + std::vector readback_data(12); + cuda_err = cudaMemcpy( + readback_data.data(), + reinterpreted_data_ptr, + readback_data.size() * sizeof(float), + cudaMemcpyDeviceToHost); + EXPECT_EQ(cuda_err, cudaSuccess); + + // Verify the data matches + for (size_t i = 0; i < test_data.size(); i++) { + EXPECT_EQ(readback_data[i], test_data[i]) + << "Data should be the same through both tensors at index " << i; + } +} + +// Test reinterpreting with different strides +TEST_F(AOTITorchReinterpretTensorTest, ReinterpretWithCustomStrides) { + // Create a source tensor with shape [2, 6] (contiguous) + std::vector source_sizes = {2, 6}; + Tensor* source_tensor = create_source_tensor(source_sizes); + ASSERT_NE(source_tensor, nullptr); + + void* original_data_ptr = source_tensor->mutable_data_ptr(); + ASSERT_NE(original_data_ptr, nullptr); + + // Reinterpret as [3, 4] with custom strides (still valid for the same memory) + std::vector new_sizes = {3, 4}; + std::vector new_strides = {4, 1}; // Row-major strides for [3, 4] + + Tensor* reinterpreted_tensor; + AOTITorchError error = aoti_torch__reinterpret_tensor( + source_tensor, + new_sizes.size(), + new_sizes.data(), + new_strides.data(), + 0, // storage_offset + &reinterpreted_tensor); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(reinterpreted_tensor, nullptr); + + // Check shape + EXPECT_EQ(reinterpreted_tensor->dim(), 2); + EXPECT_EQ(reinterpreted_tensor->size(0), 3); + EXPECT_EQ(reinterpreted_tensor->size(1), 4); + + // CRITICAL: Check that the reinterpreted tensor uses the SAME memory + void* reinterpreted_data_ptr = reinterpreted_tensor->mutable_data_ptr(); + 
EXPECT_EQ(reinterpreted_data_ptr, original_data_ptr) + << "Reinterpreted tensor should use the same memory as the source tensor"; + + // Verify strides were set correctly + int64_t* tensor_strides; + error = aoti_torch_get_strides(reinterpreted_tensor, &tensor_strides); + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(tensor_strides[0], 4); + EXPECT_EQ(tensor_strides[1], 1); +} + +// Test error cases: null input tensor +TEST_F(AOTITorchReinterpretTensorTest, NullInputTensor) { + std::vector new_sizes = {2, 3}; + std::vector new_strides = calculate_contiguous_strides(new_sizes); + + Tensor* reinterpreted_tensor; + AOTITorchError error = aoti_torch__reinterpret_tensor( + nullptr, // null input tensor + new_sizes.size(), + new_sizes.data(), + new_strides.data(), + 0, // storage_offset + &reinterpreted_tensor); + + EXPECT_EQ(error, Error::InvalidArgument); +} + +// Test error cases: null sizes pointer +TEST_F(AOTITorchReinterpretTensorTest, NullSizesPointer) { + std::vector source_sizes = {6}; + Tensor* source_tensor = create_source_tensor(source_sizes); + ASSERT_NE(source_tensor, nullptr); + + std::vector new_strides = {2, 1}; + + Tensor* reinterpreted_tensor; + AOTITorchError error = aoti_torch__reinterpret_tensor( + source_tensor, + 2, // ndim > 0 + nullptr, // null sizes pointer + new_strides.data(), + 0, // storage_offset + &reinterpreted_tensor); + + EXPECT_EQ(error, Error::InvalidArgument); +} + +// Test error cases: null return tensor pointer +TEST_F(AOTITorchReinterpretTensorTest, NullReturnTensorPointer) { + std::vector source_sizes = {6}; + Tensor* source_tensor = create_source_tensor(source_sizes); + ASSERT_NE(source_tensor, nullptr); + + std::vector new_sizes = {2, 3}; + std::vector new_strides = calculate_contiguous_strides(new_sizes); + + AOTITorchError error = aoti_torch__reinterpret_tensor( + source_tensor, + new_sizes.size(), + new_sizes.data(), + new_strides.data(), + 0, // storage_offset + nullptr); // null return tensor pointer + + EXPECT_EQ(error, 
Error::InvalidArgument); +} + +// Test error cases: non-zero storage offset (should fail) +TEST_F(AOTITorchReinterpretTensorTest, NonZeroStorageOffset) { + std::vector source_sizes = {6}; + Tensor* source_tensor = create_source_tensor(source_sizes); + ASSERT_NE(source_tensor, nullptr); + + std::vector new_sizes = {2, 3}; + std::vector new_strides = calculate_contiguous_strides(new_sizes); + + Tensor* reinterpreted_tensor; + AOTITorchError error = aoti_torch__reinterpret_tensor( + source_tensor, + new_sizes.size(), + new_sizes.data(), + new_strides.data(), + 1, // non-zero storage_offset (should fail) + &reinterpreted_tensor); + + EXPECT_EQ(error, Error::InvalidArgument); +} + +// Test reinterpreting CPU tensor +TEST_F(AOTITorchReinterpretTensorTest, ReinterpretCPUTensor) { + // Create a CPU tensor with shape [8] + std::vector source_sizes = {8}; + Tensor* source_tensor = create_source_tensor( + source_sizes, + 6, // float32 + 0, // CPU device + 0); + ASSERT_NE(source_tensor, nullptr); + + void* original_data_ptr = source_tensor->mutable_data_ptr(); + ASSERT_NE(original_data_ptr, nullptr); + + // Reinterpret as [2, 4] + std::vector new_sizes = {2, 4}; + std::vector new_strides = calculate_contiguous_strides(new_sizes); + + Tensor* reinterpreted_tensor; + AOTITorchError error = aoti_torch__reinterpret_tensor( + source_tensor, + new_sizes.size(), + new_sizes.data(), + new_strides.data(), + 0, // storage_offset + &reinterpreted_tensor); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(reinterpreted_tensor, nullptr); + + // Check that the reinterpreted tensor uses the SAME memory + void* reinterpreted_data_ptr = reinterpreted_tensor->mutable_data_ptr(); + EXPECT_EQ(reinterpreted_data_ptr, original_data_ptr) + << "Reinterpreted CPU tensor should use the same memory as the source tensor"; + + // Test direct memory access for CPU tensors + float* original_float_ptr = reinterpret_cast(original_data_ptr); + float* reinterpreted_float_ptr = + 
reinterpret_cast(reinterpreted_data_ptr); + + // Write through original and read through reinterpreted + original_float_ptr[0] = 42.0f; + EXPECT_EQ(reinterpreted_float_ptr[0], 42.0f) + << "Changes through original tensor should be visible through reinterpreted tensor"; +} + +// Test that deleting source tensor doesn't affect reinterpreted tensor (they +// share memory) +TEST_F(AOTITorchReinterpretTensorTest, DeletionBehavior) { + std::vector source_sizes = {6}; + Tensor* source_tensor = create_source_tensor(source_sizes); + ASSERT_NE(source_tensor, nullptr); + + void* shared_data_ptr = source_tensor->mutable_data_ptr(); + + // Reinterpret as [2, 3] + std::vector new_sizes = {2, 3}; + std::vector new_strides = calculate_contiguous_strides(new_sizes); + + Tensor* reinterpreted_tensor; + AOTITorchError error = aoti_torch__reinterpret_tensor( + source_tensor, + new_sizes.size(), + new_sizes.data(), + new_strides.data(), + 0, + &reinterpreted_tensor); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(reinterpreted_tensor, nullptr); + + // Verify they share the same memory + EXPECT_EQ(reinterpreted_tensor->mutable_data_ptr(), shared_data_ptr); + + // Delete the source tensor (which owns the memory) + error = aoti_torch_delete_tensor_object(source_tensor); + EXPECT_EQ(error, Error::Ok); + + // The reinterpreted tensor should still be valid but the memory might be + // freed Since the source tensor owned the memory, the reinterpreted tensor + // becomes invalid This is expected behavior - the user needs to manage the + // lifecycle properly + + // Clean up the reinterpreted tensor + error = aoti_torch_delete_tensor_object(reinterpreted_tensor); + EXPECT_EQ(error, Error::Ok); +} + +// Test scalar tensor reinterpretation +TEST_F(AOTITorchReinterpretTensorTest, ReinterpretScalarTensor) { + // Create a scalar tensor (0D) + std::vector source_sizes = {}; + Tensor* source_tensor = create_source_tensor(source_sizes); + ASSERT_NE(source_tensor, nullptr); + + void* original_data_ptr = 
source_tensor->mutable_data_ptr(); + + // Try to reinterpret scalar as [1] (1D with 1 element) + std::vector new_sizes = {1}; + std::vector new_strides = {1}; + + Tensor* reinterpreted_tensor; + AOTITorchError error = aoti_torch__reinterpret_tensor( + source_tensor, + new_sizes.size(), + new_sizes.data(), + new_strides.data(), + 0, + &reinterpreted_tensor); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(reinterpreted_tensor, nullptr); + + // Check that the reinterpreted tensor uses the SAME memory + EXPECT_EQ(reinterpreted_tensor->mutable_data_ptr(), original_data_ptr); + + // Check new shape + EXPECT_EQ(reinterpreted_tensor->dim(), 1); + EXPECT_EQ(reinterpreted_tensor->size(0), 1); +} + +// Test reinterpreting tensor with zero-sized dimension +// TODO: This test is disabled because zero-sized tensors have complex stride +// validation requirements that need further investigation +TEST_F(AOTITorchReinterpretTensorTest, DISABLED_ReinterpretZeroSizedTensor) { + // Create a tensor with shape [0, 5] (zero elements) + std::vector source_sizes = {0, 5}; + Tensor* source_tensor = create_source_tensor(source_sizes); + ASSERT_NE(source_tensor, nullptr); + + void* original_data_ptr = source_tensor->mutable_data_ptr(); + + // Reinterpret as [5, 0] (still zero elements) + std::vector new_sizes = {5, 0}; + std::vector new_strides = calculate_contiguous_strides(new_sizes); + + Tensor* reinterpreted_tensor; + AOTITorchError error = aoti_torch__reinterpret_tensor( + source_tensor, + new_sizes.size(), + new_sizes.data(), + new_strides.data(), + 0, + &reinterpreted_tensor); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(reinterpreted_tensor, nullptr); + + // Check that the reinterpreted tensor uses the SAME memory + EXPECT_EQ(reinterpreted_tensor->mutable_data_ptr(), original_data_ptr); + + // Check new shape + EXPECT_EQ(reinterpreted_tensor->dim(), 2); + EXPECT_EQ(reinterpreted_tensor->size(0), 5); + EXPECT_EQ(reinterpreted_tensor->size(1), 0); +} + +// Test with nullptr strides 
(should use contiguous strides) +TEST_F(AOTITorchReinterpretTensorTest, NullStridesPointer) { + std::vector source_sizes = {12}; + Tensor* source_tensor = create_source_tensor(source_sizes); + ASSERT_NE(source_tensor, nullptr); + + void* original_data_ptr = source_tensor->mutable_data_ptr(); + + // Reinterpret as [3, 4] with null strides (should calculate contiguous + // strides) + std::vector new_sizes = {3, 4}; + + Tensor* reinterpreted_tensor; + AOTITorchError error = aoti_torch__reinterpret_tensor( + source_tensor, + new_sizes.size(), + new_sizes.data(), + nullptr, // null strides - should calculate contiguous strides + 0, + &reinterpreted_tensor); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(reinterpreted_tensor, nullptr); + + // Check that the reinterpreted tensor uses the SAME memory + EXPECT_EQ(reinterpreted_tensor->mutable_data_ptr(), original_data_ptr); + + // Check that contiguous strides were calculated correctly + int64_t* tensor_strides; + error = aoti_torch_get_strides(reinterpreted_tensor, &tensor_strides); + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(tensor_strides[0], 4); // stride for dimension 0 should be 4 + EXPECT_EQ(tensor_strides[1], 1); // stride for dimension 1 should be 1 +} + +// Test bf16 tensor reinterpretation +TEST_F(AOTITorchReinterpretTensorTest, ReinterpretBF16Tensor) { + // Create a bf16 source tensor with shape [6] + std::vector source_sizes = {6}; + Tensor* source_tensor = create_source_tensor( + source_sizes, + static_cast( + SupportedDTypes::BFLOAT16), // bf16 dtype from SupportedDTypes + static_cast( + SupportedDevices::CUDA), // CUDA device from SupportedDevices + 0); // device_index must be 0 + ASSERT_NE(source_tensor, nullptr); + + void* original_data_ptr = source_tensor->mutable_data_ptr(); + ASSERT_NE(original_data_ptr, nullptr); + + // Verify the tensor is actually bf16 + int32_t actual_dtype = 0; + AOTITorchError dtype_check_error = + aoti_torch_get_dtype(source_tensor, &actual_dtype); + EXPECT_EQ(dtype_check_error, 
Error::Ok); + EXPECT_EQ(actual_dtype, static_cast(SupportedDTypes::BFLOAT16)) + << "Source tensor should have bfloat16 dtype"; + + // Reinterpret as [2, 3] (same number of elements) + std::vector new_sizes = {2, 3}; + std::vector new_strides = calculate_contiguous_strides(new_sizes); + + Tensor* reinterpreted_tensor; + AOTITorchError error = aoti_torch__reinterpret_tensor( + source_tensor, + new_sizes.size(), + new_sizes.data(), + new_strides.data(), + 0, // storage_offset + &reinterpreted_tensor); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(reinterpreted_tensor, nullptr); + + // Check that the reinterpreted tensor has the new shape + EXPECT_EQ(reinterpreted_tensor->dim(), 2); + EXPECT_EQ(reinterpreted_tensor->size(0), 2); + EXPECT_EQ(reinterpreted_tensor->size(1), 3); + + // Verify the dtype is preserved as bf16 + int32_t reinterpreted_dtype = 0; + dtype_check_error = + aoti_torch_get_dtype(reinterpreted_tensor, &reinterpreted_dtype); + EXPECT_EQ(dtype_check_error, Error::Ok); + EXPECT_EQ( + reinterpreted_dtype, static_cast(SupportedDTypes::BFLOAT16)) + << "Reinterpreted tensor should preserve bfloat16 dtype"; + + // CRITICAL: Check that the reinterpreted tensor uses the SAME memory + void* reinterpreted_data_ptr = reinterpreted_tensor->mutable_data_ptr(); + EXPECT_EQ(reinterpreted_data_ptr, original_data_ptr) + << "Reinterpreted tensor should use the same memory as the source tensor"; + + // Test memory sharing by writing data through the original tensor + // and verifying it's visible through the reinterpreted tensor + // Note: bf16 has 2 bytes per element + std::vector test_data_bf16 = { + 0x3F80, 0x4000, 0x4040, 0x4080, 0x40A0, 0x40C0}; // bf16 values + cudaError_t cuda_err = cudaMemcpy( + original_data_ptr, + test_data_bf16.data(), + test_data_bf16.size() * sizeof(uint16_t), + cudaMemcpyHostToDevice); + EXPECT_EQ(cuda_err, cudaSuccess); + + // Read back through the reinterpreted tensor + std::vector readback_data_bf16(6); + cuda_err = cudaMemcpy( + 
readback_data_bf16.data(), + reinterpreted_data_ptr, + readback_data_bf16.size() * sizeof(uint16_t), + cudaMemcpyDeviceToHost); + EXPECT_EQ(cuda_err, cudaSuccess); + + // Verify the data matches + for (size_t i = 0; i < test_data_bf16.size(); i++) { + EXPECT_EQ(readback_data_bf16[i], test_data_bf16[i]) + << "BF16 data should be the same through both tensors at index " << i; + } +} + +// Test reference counting behavior - memory not in map should fail +TEST_F(AOTITorchReinterpretTensorTest, MemoryNotInMapShouldFail) { + // Create a tensor directly without using our allocation functions + // This should NOT be in the reference counting map + void* external_memory; + ASSERT_EQ( + cudaMallocManaged(&external_memory, 12 * sizeof(float)), cudaSuccess); + + // Create a tensor by manually wrapping this memory without going through our + // APIs + std::vector sizes = {12}; + std::vector strides = calculate_contiguous_strides(sizes); + + // Create the tensor directly using ExecutorTorch extension + auto tensor_shared = executorch::extension::from_blob( + external_memory, + convert_sizes_to_vector(sizes.size(), sizes.data()), + convert_strides_to_vector(sizes.size(), sizes.data(), strides.data()), + executorch::runtime::etensor::ScalarType::Float); + + ASSERT_TRUE(tensor_shared); + Tensor* external_tensor = tensor_shared.get(); + + // Try to reinterpret this tensor - should fail because memory is not in map + std::vector new_sizes = {3, 4}; + std::vector new_strides = calculate_contiguous_strides(new_sizes); + + Tensor* reinterpreted_tensor; + AOTITorchError error = aoti_torch__reinterpret_tensor( + external_tensor, + new_sizes.size(), + new_sizes.data(), + new_strides.data(), + 0, // storage_offset + &reinterpreted_tensor); + + // Should fail because memory is not being tracked by reference counting + // system + EXPECT_EQ(error, Error::InvalidArgument); + + // Clean up the external memory + ASSERT_EQ(cudaFree(external_memory), cudaSuccess); +} + +// Test reference counting 
behavior - creating view increments reference count +TEST_F(AOTITorchReinterpretTensorTest, ViewCreationIncrementsReferenceCount) { + // Create a source tensor that owns memory (reference count = 1) + std::vector source_sizes = {12}; + Tensor* source_tensor = create_source_tensor(source_sizes); + ASSERT_NE(source_tensor, nullptr); + + void* shared_data_ptr = source_tensor->mutable_data_ptr(); + ASSERT_NE(shared_data_ptr, nullptr); + + // Create first view - should increment reference count to 2 + std::vector view1_sizes = {3, 4}; + std::vector view1_strides = + calculate_contiguous_strides(view1_sizes); + + Tensor* view1_tensor; + AOTITorchError error = aoti_torch__reinterpret_tensor( + source_tensor, + view1_sizes.size(), + view1_sizes.data(), + view1_strides.data(), + 0, + &view1_tensor); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(view1_tensor, nullptr); + EXPECT_EQ(view1_tensor->mutable_data_ptr(), shared_data_ptr); + + // Create second view - should increment reference count to 3 + std::vector view2_sizes = {2, 6}; + std::vector view2_strides = + calculate_contiguous_strides(view2_sizes); + + Tensor* view2_tensor; + error = aoti_torch__reinterpret_tensor( + source_tensor, + view2_sizes.size(), + view2_sizes.data(), + view2_strides.data(), + 0, + &view2_tensor); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(view2_tensor, nullptr); + EXPECT_EQ(view2_tensor->mutable_data_ptr(), shared_data_ptr); + + // Now delete the source tensor - memory should NOT be freed (reference count + // = 2) + error = aoti_torch_delete_tensor_object(source_tensor); + EXPECT_EQ(error, Error::Ok); + + // Both views should still be valid - test by accessing memory + float test_value = 42.0f; + cudaError_t cuda_err = cudaMemcpy( + shared_data_ptr, &test_value, sizeof(float), cudaMemcpyHostToDevice); + EXPECT_EQ(cuda_err, cudaSuccess); + + float readback_value = 0.0f; + cuda_err = cudaMemcpy( + &readback_value, + view1_tensor->mutable_data_ptr(), + sizeof(float), + cudaMemcpyDeviceToHost); 
+ EXPECT_EQ(cuda_err, cudaSuccess); + EXPECT_EQ(readback_value, test_value); + + // Delete first view - memory should still NOT be freed (reference count = 1) + error = aoti_torch_delete_tensor_object(view1_tensor); + EXPECT_EQ(error, Error::Ok); + + // Second view should still be valid + readback_value = 0.0f; + cuda_err = cudaMemcpy( + &readback_value, + view2_tensor->mutable_data_ptr(), + sizeof(float), + cudaMemcpyDeviceToHost); + EXPECT_EQ(cuda_err, cudaSuccess); + EXPECT_EQ(readback_value, test_value); + + // Delete second view - NOW memory should be freed (reference count = 0) + error = aoti_torch_delete_tensor_object(view2_tensor); + EXPECT_EQ(error, Error::Ok); +} + +// Test reference counting behavior with NOT_OWN memory (from blob) - should +// SUCCEED and keep NOT_OWN +TEST_F(AOTITorchReinterpretTensorTest, ViewOfNotOwnMemoryKeepsNotOwnStatus) { + // Allocate external memory + void* external_memory; + cudaError_t cuda_err = + cudaMallocManaged(&external_memory, 12 * sizeof(float)); + ASSERT_EQ(cuda_err, cudaSuccess); + + // Create tensor from blob (which marks memory as NOT_OWN) + std::vector blob_sizes = {12}; + std::vector blob_strides = calculate_contiguous_strides(blob_sizes); + + Tensor* blob_tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + external_memory, + blob_sizes.size(), + blob_sizes.data(), + blob_strides.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device_index + &blob_tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(blob_tensor, nullptr); + + // Create view of NOT_OWN memory - should SUCCEED and keep NOT_OWN status + std::vector view_sizes = {3, 4}; + std::vector view_strides = calculate_contiguous_strides(view_sizes); + + Tensor* view_tensor; + error = aoti_torch__reinterpret_tensor( + blob_tensor, + view_sizes.size(), + view_sizes.data(), + view_strides.data(), 
+ 0, + &view_tensor); + + // Should succeed - NOT_OWN memory can be reinterpreted but stays NOT_OWN + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(view_tensor, nullptr); + EXPECT_EQ(view_tensor->mutable_data_ptr(), external_memory); + + // Verify both tensors share the same memory + EXPECT_EQ(blob_tensor->mutable_data_ptr(), view_tensor->mutable_data_ptr()); + + // Test memory sharing by writing data through one tensor and reading through + // the other + float test_value = 42.0f; + cuda_err = cudaMemcpy( + external_memory, &test_value, sizeof(float), cudaMemcpyHostToDevice); + EXPECT_EQ(cuda_err, cudaSuccess); + + float readback_value = 0.0f; + cuda_err = cudaMemcpy( + &readback_value, + view_tensor->mutable_data_ptr(), + sizeof(float), + cudaMemcpyDeviceToHost); + EXPECT_EQ(cuda_err, cudaSuccess); + EXPECT_EQ(readback_value, test_value); + + // Delete the blob tensor - external memory should NOT be freed (NOT_OWN + // behavior) + error = aoti_torch_delete_tensor_object(blob_tensor); + EXPECT_EQ(error, Error::Ok); + + // View tensor should still be valid - test by accessing memory + readback_value = 0.0f; + cuda_err = cudaMemcpy( + &readback_value, + view_tensor->mutable_data_ptr(), + sizeof(float), + cudaMemcpyDeviceToHost); + EXPECT_EQ(cuda_err, cudaSuccess); + EXPECT_EQ(readback_value, test_value); + + // Delete view tensor - external memory should still NOT be freed (NOT_OWN + // behavior) + error = aoti_torch_delete_tensor_object(view_tensor); + EXPECT_EQ(error, Error::Ok); + + // External memory should still be accessible (proves neither tensor freed it) + readback_value = 0.0f; + cuda_err = cudaMemcpy( + &readback_value, external_memory, sizeof(float), cudaMemcpyDeviceToHost); + EXPECT_EQ(cuda_err, cudaSuccess); + EXPECT_EQ(readback_value, test_value); + + // Clean up external memory manually (as expected for NOT_OWN memory) + ASSERT_EQ(cudaFree(external_memory), cudaSuccess); +} diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_copy_.cpp 
b/backends/cuda/runtime/shims/tests/test_aoti_torch_copy_.cpp new file mode 100644 index 00000000000..9fca0f92cf8 --- /dev/null +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_copy_.cpp @@ -0,0 +1,398 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace executorch::backends::cuda; +using namespace executorch::backends::aoti; +using namespace executorch::runtime; + +// Test fixture for aoti_torch_copy_ tests +class AOTITorchCopyTest : public ::testing::Test { + protected: + void SetUp() override { + // Initialize ExecuTorch Platform Abstraction Layer + et_pal_init(); + + // Check if CUDA is available + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + if (err != cudaSuccess || device_count == 0) { + GTEST_SKIP() << "CUDA not available, skipping CUDA tests"; + } + + // Clean up any existing cached metadata before each test + cleanup_tensor_metadata(); + + // Clear any remaining tensors from previous tests + clear_all_tensors(); + } + + void TearDown() override { + // Clean up metadata + cleanup_tensor_metadata(); + + // Clear the global tensor storage using the provided function + clear_all_tensors(); + } + + // Helper to create test tensors with specific data + Tensor* create_test_tensor_with_data( + const std::vector& sizes, + const std::vector& data, + const std::vector& strides = {}, + int32_t dtype = static_cast(SupportedDTypes::FLOAT32), + int32_t device_type = static_cast(SupportedDevices::CUDA), + int32_t device_index = 0) { + Tensor* tensor; + + const int64_t* strides_ptr = strides.empty() ? 
nullptr : strides.data(); + + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + strides_ptr, + dtype, + device_type, + device_index, + &tensor); + + if (error != Error::Ok || tensor == nullptr) { + return nullptr; + } + + // Fill tensor with data + size_t total_bytes = data.size() * sizeof(float); + if (device_type == static_cast(SupportedDevices::CUDA)) { + cudaError_t memcpy_err = cudaMemcpy( + tensor->mutable_data_ptr(), + data.data(), + total_bytes, + cudaMemcpyHostToDevice); + // Note: Error is checked but we don't fail the function + // This allows tests to proceed and handle errors as needed + (void)memcpy_err; // Suppress unused variable warning + } else { // CPU + std::memcpy(tensor->mutable_data_ptr(), data.data(), total_bytes); + } + + return tensor; + } + + // Helper to get data from tensor + std::vector get_tensor_data(Tensor* tensor) { + if (!tensor) { + return {}; + } + + size_t num_elements = tensor->numel(); + std::vector data(num_elements); + + // Determine if this is a CUDA tensor + cudaPointerAttributes attributes{}; + cudaError_t err = cudaPointerGetAttributes(&attributes, tensor->data_ptr()); + bool is_device = + (err == cudaSuccess && attributes.type == cudaMemoryTypeDevice); + + if (is_device) { + cudaError_t memcpy_err = cudaMemcpy( + data.data(), + tensor->data_ptr(), + num_elements * sizeof(float), + cudaMemcpyDeviceToHost); + // Note: Error is checked but we don't fail the function + // This allows tests to proceed and handle errors as needed + (void)memcpy_err; // Suppress unused variable warning + } else { + std::memcpy( + data.data(), tensor->data_ptr(), num_elements * sizeof(float)); + } + + return data; + } + + // Helper to verify two tensors have same data + bool tensors_equal(Tensor* a, Tensor* b, float tolerance = 1e-6f) { + if (!a || !b) { + return false; + } + if (a->numel() != b->numel()) { + return false; + } + + auto data_a = get_tensor_data(a); + auto data_b = get_tensor_data(b); + + for 
(size_t i = 0; i < data_a.size(); ++i) { + if (std::abs(data_a[i] - data_b[i]) > tolerance) { + return false; + } + } + return true; + } +}; + +// Test basic copy functionality - same schema (fast path) +TEST_F(AOTITorchCopyTest, BasicCopySameSchema) { + // Create source tensor with test data + std::vector sizes = {2, 3}; + std::vector src_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + + Tensor* src = create_test_tensor_with_data(sizes, src_data); + EXPECT_NE(src, nullptr); + + // Create destination tensor with same schema + Tensor* dst = + create_test_tensor_with_data(sizes, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}); + EXPECT_NE(dst, nullptr); + + // Perform copy + AOTITorchError error = aoti_torch_copy_(dst, src, 0); + EXPECT_EQ(error, Error::Ok); + + // Verify copy was successful + EXPECT_TRUE(tensors_equal(dst, src)); +} + +// Test copy with different strides (pointwise fallback) +TEST_F(AOTITorchCopyTest, CopyDifferentStrides) { + // Create source tensor (2x3) with contiguous layout + std::vector src_sizes = {2, 3}; + std::vector src_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + + Tensor* src = create_test_tensor_with_data(src_sizes, src_data); + EXPECT_NE(src, nullptr); + + // Create destination tensor with transposed strides + std::vector dst_strides = {1, 2}; // Column-major layout + Tensor* dst = create_test_tensor_with_data( + src_sizes, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, dst_strides); + EXPECT_NE(dst, nullptr); + + // Perform copy - this should use pointwise fallback + AOTITorchError error = aoti_torch_copy_(dst, src, 0); + EXPECT_EQ(error, Error::Ok); + + // Verify the copy worked correctly by checking specific elements + auto dst_data = get_tensor_data(dst); + auto src_data_check = get_tensor_data(src); + + // For transposed layout, the data should be rearranged + EXPECT_EQ(dst_data.size(), 6); + EXPECT_EQ(src_data_check.size(), 6); +} + +// Test copy between CPU and CUDA tensors +TEST_F(AOTITorchCopyTest, CopyCPUToCUDA) { + std::vector sizes = {2, 2}; 
+ std::vector data = {1.0f, 2.0f, 3.0f, 4.0f}; + + // Create CPU tensor + Tensor* cpu_tensor = create_test_tensor_with_data( + sizes, + data, + {}, + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CPU)); // CPU + EXPECT_NE(cpu_tensor, nullptr); + + // Create CUDA tensor + Tensor* cuda_tensor = create_test_tensor_with_data( + sizes, + {0.0f, 0.0f, 0.0f, 0.0f}, + {}, + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA)); // CUDA + EXPECT_NE(cuda_tensor, nullptr); + + // Copy from CPU to CUDA + AOTITorchError error = aoti_torch_copy_(cuda_tensor, cpu_tensor, 0); + EXPECT_EQ(error, Error::Ok); + + // Verify copy + EXPECT_TRUE(tensors_equal(cuda_tensor, cpu_tensor)); +} + +// Test copy between CUDA and CPU tensors +TEST_F(AOTITorchCopyTest, CopyCUDAToCPU) { + std::vector sizes = {2, 2}; + std::vector data = {1.0f, 2.0f, 3.0f, 4.0f}; + + // Create CUDA tensor + Tensor* cuda_tensor = create_test_tensor_with_data( + sizes, + data, + {}, + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA)); // CUDA + EXPECT_NE(cuda_tensor, nullptr); + + // Create CPU tensor + Tensor* cpu_tensor = create_test_tensor_with_data( + sizes, + {0.0f, 0.0f, 0.0f, 0.0f}, + {}, + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CPU)); // CPU + EXPECT_NE(cpu_tensor, nullptr); + + // Copy from CUDA to CPU + AOTITorchError error = aoti_torch_copy_(cpu_tensor, cuda_tensor, 0); + EXPECT_EQ(error, Error::Ok); + + // Verify copy + EXPECT_TRUE(tensors_equal(cpu_tensor, cuda_tensor)); +} + +// Test copy with bf16 dtype support +TEST_F(AOTITorchCopyTest, CopyBf16Tensors) { + // Test that bf16 tensors can be created and copied + std::vector sizes = {2, 3}; + std::vector src_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + + // Note: We create float32 data but the tensor will be created with bf16 dtype + // This simulates creating bf16 tensors + Tensor* src = create_test_tensor_with_data( + sizes, + src_data, + {}, 
// default strides + static_cast(SupportedDTypes::BFLOAT16), // bf16 dtype + static_cast(SupportedDevices::CUDA), // CUDA device + 0 // device_index = 0 + ); + EXPECT_NE(src, nullptr); + + // Create destination tensor with bf16 dtype + std::vector dst_init(6, 0.0f); + Tensor* dst = create_test_tensor_with_data( + sizes, + dst_init, + {}, // default strides + static_cast(SupportedDTypes::BFLOAT16), // bf16 dtype + static_cast(SupportedDevices::CUDA), // CUDA device + 0 // device_index = 0 + ); + EXPECT_NE(dst, nullptr); + + // Perform copy between bf16 tensors + AOTITorchError error = aoti_torch_copy_(dst, src, 0); + EXPECT_EQ(error, Error::Ok); + + // Verify that both tensors have the expected dtype + int32_t src_dtype, dst_dtype; + aoti_torch_get_dtype(src, &src_dtype); + aoti_torch_get_dtype(dst, &dst_dtype); + + EXPECT_EQ(src_dtype, static_cast(SupportedDTypes::BFLOAT16)); + EXPECT_EQ(dst_dtype, static_cast(SupportedDTypes::BFLOAT16)); + + // Verify copy was successful by checking numel matches + EXPECT_EQ(src->numel(), dst->numel()); + EXPECT_EQ(src->numel(), 6); +} + +// Test copy between different dtypes should fail +TEST_F(AOTITorchCopyTest, CopyDTypeMismatchError) { + std::vector sizes = {2, 2}; + std::vector data = {1.0f, 2.0f, 3.0f, 4.0f}; + + // Create float32 tensor + Tensor* float32_tensor = create_test_tensor_with_data( + sizes, + data, + {}, // default strides + static_cast(SupportedDTypes::FLOAT32), // float32 dtype + static_cast(SupportedDevices::CUDA), // CUDA device + 0 // device_index = 0 + ); + EXPECT_NE(float32_tensor, nullptr); + + // Create bf16 tensor + Tensor* bf16_tensor = create_test_tensor_with_data( + sizes, + {0.0f, 0.0f, 0.0f, 0.0f}, + {}, // default strides + static_cast(SupportedDTypes::BFLOAT16), // bf16 dtype + static_cast(SupportedDevices::CUDA), // CUDA device + 0 // device_index = 0 + ); + EXPECT_NE(bf16_tensor, nullptr); + + // Attempting to copy between different dtypes should fail + AOTITorchError error = 
aoti_torch_copy_(bf16_tensor, float32_tensor, 0); + EXPECT_EQ(error, Error::InvalidArgument); + + // Reverse direction should also fail + error = aoti_torch_copy_(float32_tensor, bf16_tensor, 0); + EXPECT_EQ(error, Error::InvalidArgument); +} + +// Test error conditions +TEST_F(AOTITorchCopyTest, ErrorHandling) { + std::vector sizes = {2, 3}; + std::vector data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + + Tensor* valid_tensor = create_test_tensor_with_data(sizes, data); + EXPECT_NE(valid_tensor, nullptr); + + // Test null pointers + AOTITorchError error = aoti_torch_copy_(nullptr, valid_tensor, 0); + EXPECT_NE(error, Error::Ok); + + error = aoti_torch_copy_(valid_tensor, nullptr, 0); + EXPECT_NE(error, Error::Ok); + + // Test numel mismatch (different total number of elements) + std::vector different_numel_sizes = { + 2, 3, 4}; // 24 elements vs 6 elements + std::vector different_data(24, 1.0f); + Tensor* different_numel = + create_test_tensor_with_data(different_numel_sizes, different_data); + EXPECT_NE(different_numel, nullptr); + + error = aoti_torch_copy_(valid_tensor, different_numel, 0); + EXPECT_EQ(error, Error::InvalidArgument); +} + +// Test copy from 1D to 3D with same total elements +TEST_F(AOTITorchCopyTest, Copy1DTo3DSameNumel) { + // Source tensor: 8 elements in 1D + std::vector src_sizes = {8}; + std::vector src_data = { + 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}; + + Tensor* src = create_test_tensor_with_data(src_sizes, src_data); + EXPECT_NE(src, nullptr); + + // Destination tensor: 2x2x2 = 8 elements (different shape, same total) + std::vector dst_sizes = {2, 2, 2}; + std::vector dst_init(8, 0.0f); + Tensor* dst = create_test_tensor_with_data(dst_sizes, dst_init); + EXPECT_NE(dst, nullptr); + + // This should work - same total number of elements + AOTITorchError error = aoti_torch_copy_(dst, src, 0); + EXPECT_EQ(error, Error::Ok); + + // Verify the data was copied correctly + auto dst_data = get_tensor_data(dst); + EXPECT_EQ(dst_data.size(), 
8); + + // Check some specific elements to verify correct copying + EXPECT_FLOAT_EQ(dst_data[0], 1.0f); + EXPECT_FLOAT_EQ(dst_data[7], 8.0f); +} diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp new file mode 100644 index 00000000000..d9b785a5a78 --- /dev/null +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_create_tensor_from_blob_v2.cpp @@ -0,0 +1,754 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace executorch::backends::aoti; +using namespace executorch::backends::cuda; +using namespace executorch::runtime; +using executorch::runtime::etensor::Tensor; + +// Test fixture for aoti_torch_create_tensor_from_blob_v2 tests +class AOTITorchCreateTensorFromBlobV2Test : public ::testing::Test { + protected: + void SetUp() override { + // Initialize ExecuTorch Platform Abstraction Layer + et_pal_init(); + + // Check if CUDA is available + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + if (err != cudaSuccess || device_count == 0) { + GTEST_SKIP() << "CUDA not available, skipping CUDA tests"; + } + + // Clean up any existing cached metadata before each test + cleanup_tensor_metadata(); + + // Clear any remaining tensors from previous tests + clear_all_tensors(); + } + + void TearDown() override { + // Clean up metadata + cleanup_tensor_metadata(); + + // Clear the global tensor storage using the provided function + clear_all_tensors(); + + // Clean up any allocated memory buffers + for (void* ptr : cuda_memory_buffers_) { + if (ptr) { + cudaError_t cuda_err = cudaFree(ptr); + EXPECT_EQ(cuda_err, cudaSuccess) + << "Failed to 
free CUDA memory: " << cudaGetErrorString(cuda_err); + } + } + cuda_memory_buffers_.clear(); + + for (void* ptr : cpu_memory_buffers_) { + if (ptr) { + free(ptr); + } + } + cpu_memory_buffers_.clear(); + } + + // Helper to allocate CUDA memory and track it for cleanup + void* allocate_cuda_memory(size_t bytes) { + void* ptr; + cudaError_t err = cudaMallocManaged(&ptr, bytes); + if (err == cudaSuccess) { + cuda_memory_buffers_.push_back(ptr); + return ptr; + } + return nullptr; + } + + // Helper to allocate CPU memory and track it for cleanup + void* allocate_cpu_memory(size_t bytes) { + void* ptr; + int result = posix_memalign(&ptr, 16, bytes); // 16-byte aligned + if (result == 0 && ptr != nullptr) { + cpu_memory_buffers_.push_back(ptr); + return ptr; + } + return nullptr; + } + + // Helper to calculate number of elements from sizes + int64_t calculate_numel(const std::vector& sizes) { + int64_t numel = 1; + for (int64_t size : sizes) { + numel *= size; + } + return numel; + } + + // Helper to calculate contiguous strides from sizes + std::vector calculate_contiguous_strides( + const std::vector& sizes) { + std::vector strides(sizes.size()); + if (sizes.empty()) { + return strides; + } + + strides[sizes.size() - 1] = 1; + // Use int64_t and check for underflow to avoid unsigned integer wraparound + for (int64_t i = static_cast(sizes.size()) - 2; i >= 0; i--) { + strides[i] = strides[i + 1] * sizes[i + 1]; + } + return strides; + } + + private: + std::vector cuda_memory_buffers_; + std::vector cpu_memory_buffers_; +}; + +// Test basic functionality with CUDA memory +TEST_F(AOTITorchCreateTensorFromBlobV2Test, BasicFunctionalityCUDA) { + // Test 1D tensor + std::vector sizes_1d = {5}; + std::vector strides_1d = calculate_contiguous_strides(sizes_1d); + + // Allocate CUDA memory + size_t bytes = calculate_numel(sizes_1d) * sizeof(float); + void* cuda_data = allocate_cuda_memory(bytes); + ASSERT_NE(cuda_data, nullptr); + + Tensor* tensor_1d; + AOTITorchError error = 
aoti_torch_create_tensor_from_blob_v2( + cuda_data, + sizes_1d.size(), + sizes_1d.data(), + strides_1d.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor_1d, + 0, // layout (strided) + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor_1d, nullptr); + + // Check tensor properties + EXPECT_EQ(tensor_1d->dim(), 1); + EXPECT_EQ(tensor_1d->size(0), 5); + + // Verify the tensor uses the same data pointer + void* tensor_data = tensor_1d->mutable_data_ptr(); + EXPECT_EQ(tensor_data, cuda_data); + + // Delete the tensor - this should NOT free the original memory + error = aoti_torch_delete_tensor_object(tensor_1d); + EXPECT_EQ(error, Error::Ok); + + // Test that the original memory is still accessible (proves tensor didn't own + // it) For CUDA memory, check that we can still access it (synchronously) + // after tensor deletion + float pattern_value = 42.0f; + cudaError_t cuda_err = cudaMemcpy( + cuda_data, &pattern_value, sizeof(float), cudaMemcpyHostToDevice); + EXPECT_EQ(cuda_err, cudaSuccess) + << "Should be able to write to original CUDA memory after tensor deletion"; + + float readback_value = 0.0f; + cuda_err = cudaMemcpy( + &readback_value, cuda_data, sizeof(float), cudaMemcpyDeviceToHost); + EXPECT_EQ(cuda_err, cudaSuccess) + << "Should be able to read from original CUDA memory after tensor deletion"; + EXPECT_EQ(readback_value, pattern_value) + << "Original CUDA memory should still contain our test pattern"; +} + +// Test basic functionality with CPU memory +TEST_F(AOTITorchCreateTensorFromBlobV2Test, BasicFunctionalityCPU) { + // Test 2D tensor + std::vector sizes_2d = {3, 4}; + std::vector strides_2d = calculate_contiguous_strides(sizes_2d); + + // Allocate CPU memory + size_t bytes = calculate_numel(sizes_2d) * sizeof(float); + void* cpu_data = allocate_cpu_memory(bytes); + ASSERT_NE(cpu_data, nullptr); + + 
Tensor* tensor_2d; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + cpu_data, + sizes_2d.size(), + sizes_2d.data(), + strides_2d.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CPU), + 0, // device index + &tensor_2d, + 0, // layout (strided) + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor_2d, nullptr); + + // Check tensor properties + EXPECT_EQ(tensor_2d->dim(), 2); + EXPECT_EQ(tensor_2d->size(0), 3); + EXPECT_EQ(tensor_2d->size(1), 4); + + // Verify the tensor uses the same data pointer + void* tensor_data = tensor_2d->mutable_data_ptr(); + EXPECT_EQ(tensor_data, cpu_data); + + // Delete the tensor - this should NOT free the original memory + error = aoti_torch_delete_tensor_object(tensor_2d); + EXPECT_EQ(error, Error::Ok); + + // Test that the original memory is still accessible (proves tensor didn't own + // it) For CPU memory, directly write and read to verify accessibility + float* float_ptr = reinterpret_cast(cpu_data); + float pattern_value = 42.0f; + *float_ptr = pattern_value; + EXPECT_EQ(*float_ptr, pattern_value) + << "Original CPU memory should still be accessible after tensor deletion"; +} + +// Test with invalid dtype +TEST_F(AOTITorchCreateTensorFromBlobV2Test, InvalidDtype) { + std::vector sizes = {2, 3}; + std::vector strides = calculate_contiguous_strides(sizes); + + size_t bytes = calculate_numel(sizes) * sizeof(float); + void* data = allocate_cuda_memory(bytes); + ASSERT_NE(data, nullptr); + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 0, // storage_offset + 999, // invalid dtype + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::InvalidArgument); +} + +// Test with non-zero storage offset 
(should fail since from_blob cannot handle +// offsets) +TEST_F(AOTITorchCreateTensorFromBlobV2Test, NonZeroStorageOffset) { + std::vector sizes = {2, 3}; + std::vector strides = calculate_contiguous_strides(sizes); + + size_t bytes = calculate_numel(sizes) * sizeof(float); + void* data = allocate_cuda_memory(bytes); + ASSERT_NE(data, nullptr); + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 1, // non-zero storage_offset (should fail since from_blob cannot handle + // offsets) + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::InvalidArgument); +} + +// Test with custom strides (using stride parameter but still contiguous) +TEST_F(AOTITorchCreateTensorFromBlobV2Test, CustomContiguousStrides) { + std::vector sizes = {2, 3}; + // Use the correct contiguous strides but pass them explicitly + std::vector contiguous_strides = {3, 1}; // Proper contiguous strides + + size_t bytes = calculate_numel(sizes) * sizeof(float); + void* data = allocate_cuda_memory(bytes); + ASSERT_NE(data, nullptr); + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + contiguous_strides.data(), // Explicitly pass contiguous strides + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, nullptr); + + // Check tensor properties + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 2); + EXPECT_EQ(tensor->size(1), 3); + + // Verify the tensor uses the same data pointer + void* tensor_data = tensor->mutable_data_ptr(); + EXPECT_EQ(tensor_data, data); + + // 
Verify strides were properly set (we can check via aoti_torch_get_strides) + int64_t* tensor_strides; + error = aoti_torch_get_strides(tensor, &tensor_strides); + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(tensor_strides[0], 3); + EXPECT_EQ(tensor_strides[1], 1); + + // Delete the tensor - this should NOT free the original memory + error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); + + // Test that the original memory is still accessible (proves tensor didn't own + // it) + float pattern_value = 42.0f; + cudaError_t cuda_err = + cudaMemcpy(data, &pattern_value, sizeof(float), cudaMemcpyHostToDevice); + EXPECT_EQ(cuda_err, cudaSuccess) + << "Should be able to write to original CUDA memory after tensor deletion"; + + float readback_value = 0.0f; + cuda_err = + cudaMemcpy(&readback_value, data, sizeof(float), cudaMemcpyDeviceToHost); + EXPECT_EQ(cuda_err, cudaSuccess) + << "Should be able to read from original CUDA memory after tensor deletion"; + EXPECT_EQ(readback_value, pattern_value) + << "Original CUDA memory should still contain our test pattern"; +} + +// Test with null data pointer +TEST_F(AOTITorchCreateTensorFromBlobV2Test, NullDataPointer) { + std::vector sizes = {2, 3}; + std::vector strides = calculate_contiguous_strides(sizes); + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + nullptr, // null data pointer + sizes.size(), + sizes.data(), + strides.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::InvalidArgument); +} + +// Test scalar tensor (0D) +TEST_F(AOTITorchCreateTensorFromBlobV2Test, ScalarTensor) { + std::vector sizes = {}; // 0D tensor + std::vector strides = {}; // Empty strides for scalar + + size_t bytes = sizeof(float); // Single element + void* data = allocate_cuda_memory(bytes); + 
ASSERT_NE(data, nullptr); + + Tensor* tensor = nullptr; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(tensor, nullptr); + + // Check tensor properties + EXPECT_EQ(tensor->dim(), 0); + + // Verify the tensor uses the same data pointer + void* tensor_data = tensor->mutable_data_ptr(); + EXPECT_EQ(tensor_data, data); + + // Delete the tensor - this should NOT free the original memory + error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); + + // Test that the original memory is still accessible (proves tensor didn't own + // it) + float pattern_value = 42.0f; + cudaError_t cuda_err = + cudaMemcpy(data, &pattern_value, sizeof(float), cudaMemcpyHostToDevice); + EXPECT_EQ(cuda_err, cudaSuccess) + << "Should be able to write to original CUDA memory after tensor deletion"; + + float readback_value = 0.0f; + cuda_err = + cudaMemcpy(&readback_value, data, sizeof(float), cudaMemcpyDeviceToHost); + EXPECT_EQ(cuda_err, cudaSuccess) + << "Should be able to read from original CUDA memory after tensor deletion"; + EXPECT_EQ(readback_value, pattern_value) + << "Original CUDA memory should still contain our test pattern"; +} + +// Test zero-sized tensor +TEST_F(AOTITorchCreateTensorFromBlobV2Test, ZeroSizedTensor) { + std::vector sizes = {0, 5}; // Zero elements + std::vector strides = calculate_contiguous_strides(sizes); + + // Even for zero-sized tensor, we need some memory allocated + size_t bytes = sizeof(float); // Minimum allocation + void* data = allocate_cuda_memory(bytes); + ASSERT_NE(data, nullptr); + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + 
strides.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, nullptr); + + // Check tensor properties + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 0); + EXPECT_EQ(tensor->size(1), 5); + + // Verify the tensor uses the same data pointer + void* tensor_data = tensor->mutable_data_ptr(); + EXPECT_EQ(tensor_data, data); + + // Delete the tensor - this should NOT free the original memory + error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); + + // Test that the original memory is still accessible (proves tensor didn't own + // it) + float pattern_value = 42.0f; + cudaError_t cuda_err = + cudaMemcpy(data, &pattern_value, sizeof(float), cudaMemcpyHostToDevice); + EXPECT_EQ(cuda_err, cudaSuccess) + << "Should be able to write to original CUDA memory after tensor deletion"; + + float readback_value = 0.0f; + cuda_err = + cudaMemcpy(&readback_value, data, sizeof(float), cudaMemcpyDeviceToHost); + EXPECT_EQ(cuda_err, cudaSuccess) + << "Should be able to read from original CUDA memory after tensor deletion"; + EXPECT_EQ(readback_value, pattern_value) + << "Original CUDA memory should still contain our test pattern"; +} + +// Test multi-dimensional tensors +TEST_F(AOTITorchCreateTensorFromBlobV2Test, MultiDimensionalTensors) { + // Test 3D tensor + std::vector sizes_3d = {2, 3, 4}; + std::vector strides_3d = calculate_contiguous_strides(sizes_3d); + + size_t bytes_3d = calculate_numel(sizes_3d) * sizeof(float); + void* data_3d = allocate_cuda_memory(bytes_3d); + ASSERT_NE(data_3d, nullptr); + + Tensor* tensor_3d; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data_3d, + sizes_3d.size(), + sizes_3d.data(), + strides_3d.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + 
static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor_3d, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor_3d, nullptr); + EXPECT_EQ(tensor_3d->dim(), 3); + EXPECT_EQ(tensor_3d->size(0), 2); + EXPECT_EQ(tensor_3d->size(1), 3); + EXPECT_EQ(tensor_3d->size(2), 4); + + // Test 4D tensor + std::vector sizes_4d = {2, 3, 4, 5}; + std::vector strides_4d = calculate_contiguous_strides(sizes_4d); + + size_t bytes_4d = calculate_numel(sizes_4d) * sizeof(float); + void* data_4d = allocate_cuda_memory(bytes_4d); + ASSERT_NE(data_4d, nullptr); + + Tensor* tensor_4d; + error = aoti_torch_create_tensor_from_blob_v2( + data_4d, + sizes_4d.size(), + sizes_4d.data(), + strides_4d.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor_4d, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor_4d, nullptr); + EXPECT_EQ(tensor_4d->dim(), 4); + EXPECT_EQ(tensor_4d->size(0), 2); + EXPECT_EQ(tensor_4d->size(1), 3); + EXPECT_EQ(tensor_4d->size(2), 4); + EXPECT_EQ(tensor_4d->size(3), 5); +} + +// Test tensor data pointer consistency +TEST_F(AOTITorchCreateTensorFromBlobV2Test, DataPointerConsistency) { + std::vector sizes = {2, 3}; + std::vector strides = calculate_contiguous_strides(sizes); + + size_t bytes = calculate_numel(sizes) * sizeof(float); + void* original_data = allocate_cuda_memory(bytes); + ASSERT_NE(original_data, nullptr); + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + original_data, + sizes.size(), + sizes.data(), + strides.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, 
nullptr); + + // Check that the tensor uses the same data pointer + void* tensor_data = tensor->mutable_data_ptr(); + EXPECT_EQ(tensor_data, original_data); +} + +// Test creating multiple tensors from different blobs +TEST_F(AOTITorchCreateTensorFromBlobV2Test, MultipleTensorsFromBlobs) { + const int num_tensors = 5; + std::vector tensors; + std::vector data_ptrs; + + for (int i = 0; i < num_tensors; i++) { + std::vector sizes = {i + 1, i + 2}; + std::vector strides = calculate_contiguous_strides(sizes); + + size_t bytes = calculate_numel(sizes) * sizeof(float); + void* data = allocate_cuda_memory(bytes); + ASSERT_NE(data, nullptr); + data_ptrs.push_back(data); + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, nullptr); + tensors.push_back(tensor); + + // Verify dimensions + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), i + 1); + EXPECT_EQ(tensor->size(1), i + 2); + + // Verify the tensor uses the correct data pointer + EXPECT_EQ(tensor->mutable_data_ptr(), data); + } + + // Verify all tensors have different data pointers + for (int i = 0; i < num_tensors; i++) { + EXPECT_EQ(tensors[i]->mutable_data_ptr(), data_ptrs[i]); + for (int j = i + 1; j < num_tensors; j++) { + EXPECT_NE(tensors[i]->mutable_data_ptr(), tensors[j]->mutable_data_ptr()); + } + } +} + +// Test deletion of tensor created from blob (should not free the original +// memory) +TEST_F(AOTITorchCreateTensorFromBlobV2Test, DeletionDoesNotFreeOriginalMemory) { + std::vector sizes = {2, 3}; + std::vector strides = calculate_contiguous_strides(sizes); + + size_t bytes = calculate_numel(sizes) * sizeof(float); + void* data = 
allocate_cuda_memory(bytes); + ASSERT_NE(data, nullptr); + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, nullptr); + + // Delete the tensor - this should NOT free the original memory + error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); + + // The original memory should still be valid (we'll free it in teardown) + // We can't easily test if the memory is still valid without risking crashes, + // but the test should pass without issues if memory management is correct +} + +// Test with opaque metadata +TEST_F(AOTITorchCreateTensorFromBlobV2Test, WithOpaqueMetadata) { + std::vector sizes = {2, 3}; + std::vector strides = calculate_contiguous_strides(sizes); + + size_t bytes = calculate_numel(sizes) * sizeof(float); + void* data = allocate_cuda_memory(bytes); + ASSERT_NE(data, nullptr); + + // Create some opaque metadata + std::vector metadata = {0x01, 0x02, 0x03, 0x04}; + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + metadata.data(), // opaque_metadata + metadata.size()); // opaque_metadata_size + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, nullptr); + + // Check tensor properties + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 2); + EXPECT_EQ(tensor->size(1), 3); +} + +// Test stress test with many small tensors from blobs +TEST_F(AOTITorchCreateTensorFromBlobV2Test, StressTestManySmallTensors) { + const int num_tensors = 50; // 
Reduced for reasonable test time + std::vector tensors; + + for (int i = 0; i < num_tensors; i++) { + std::vector sizes = {1, 1}; // Minimal size + std::vector strides = calculate_contiguous_strides(sizes); + + size_t bytes = calculate_numel(sizes) * sizeof(float); + void* data = allocate_cuda_memory(bytes); + if (data == nullptr) { + // Skip if we run out of memory + continue; + } + + Tensor* tensor; + AOTITorchError error = aoti_torch_create_tensor_from_blob_v2( + data, + sizes.size(), + sizes.data(), + strides.data(), + 0, // storage_offset + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor, + 0, // layout + nullptr, // opaque_metadata + 0); // opaque_metadata_size + + if (error == Error::Ok && tensor != nullptr) { + tensors.push_back(tensor); + + // Verify the tensor uses the correct data pointer + EXPECT_EQ(tensor->mutable_data_ptr(), data); + } + } + + // Delete all created tensors + for (Tensor* tensor : tensors) { + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); + } +} diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_cuda_guard.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_cuda_guard.cpp new file mode 100644 index 00000000000..7527965cdb8 --- /dev/null +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_cuda_guard.cpp @@ -0,0 +1,199 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include + +using namespace executorch::backends::aoti; +using namespace executorch::backends::cuda; +using namespace executorch::runtime; + +// TODO(gasoonjia): Multiple device tests were not included due to test +// environment limitations. Will be added in the future. 
+class AOTITorchCUDAGuardTest : public ::testing::Test { + protected: + void SetUp() override { + et_pal_init(); + + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + if (err != cudaSuccess || device_count == 0) { + GTEST_SKIP() << "CUDA not available, skipping CUDA tests"; + } + + ASSERT_EQ(cudaGetDevice(&original_device_), cudaSuccess); + } + + void TearDown() override { + if (cudaGetDeviceCount(&original_device_) == cudaSuccess) { + ASSERT_EQ(cudaGetDevice(&original_device_), cudaSuccess); + } + } + + int original_device_ = 0; +}; + +TEST_F(AOTITorchCUDAGuardTest, CreateAndDeleteCUDAGuard) { + CUDAGuardHandle guard = nullptr; + AOTITorchError error = aoti_torch_create_cuda_guard(0, &guard); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(guard, nullptr); + + int current_device = -1; + ASSERT_EQ(cudaGetDevice(¤t_device), cudaSuccess); + EXPECT_EQ(current_device, 0); + + error = aoti_torch_delete_cuda_guard(guard); + EXPECT_EQ(error, Error::Ok); +} + +TEST_F(AOTITorchCUDAGuardTest, CreateCUDAGuardNullReturnPointer) { + AOTITorchError error = aoti_torch_create_cuda_guard(0, nullptr); + EXPECT_EQ(error, Error::InvalidArgument); +} + +TEST_F(AOTITorchCUDAGuardTest, DeleteCUDAGuardNullHandle) { + AOTITorchError error = aoti_torch_delete_cuda_guard(nullptr); + EXPECT_EQ(error, Error::InvalidArgument); +} + +TEST_F(AOTITorchCUDAGuardTest, CUDAGuardSetIndexNullHandle) { + AOTITorchError error = aoti_torch_cuda_guard_set_index(nullptr, 0); + EXPECT_EQ(error, Error::InvalidArgument); +} + +TEST_F(AOTITorchCUDAGuardTest, CUDAGuardSetIndexInvalidDevice) { + CUDAGuardHandle guard = nullptr; + AOTITorchError error = aoti_torch_create_cuda_guard(0, &guard); + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(guard, nullptr); + + error = aoti_torch_cuda_guard_set_index(guard, 999); + EXPECT_NE(error, Error::Ok); + + error = aoti_torch_delete_cuda_guard(guard); + EXPECT_EQ(error, Error::Ok); +} + +TEST_F(AOTITorchCUDAGuardTest, CreateAndDeleteCUDAStreamGuard) { + 
cudaStream_t stream; + ASSERT_EQ(cudaStreamCreate(&stream), cudaSuccess); + + CUDAStreamGuardHandle guard = nullptr; + AOTITorchError error = aoti_torch_create_cuda_stream_guard(stream, 0, &guard); + + EXPECT_EQ(error, Error::Ok); + ASSERT_NE(guard, nullptr); + + error = aoti_torch_delete_cuda_stream_guard(guard); + EXPECT_EQ(error, Error::Ok); + + ASSERT_EQ(cudaStreamDestroy(stream), cudaSuccess); +} + +TEST_F(AOTITorchCUDAGuardTest, CreateCUDAStreamGuardNullReturnPointer) { + cudaStream_t stream; + ASSERT_EQ(cudaStreamCreate(&stream), cudaSuccess); + + AOTITorchError error = + aoti_torch_create_cuda_stream_guard(stream, 0, nullptr); + EXPECT_EQ(error, Error::InvalidArgument); + + ASSERT_EQ(cudaStreamDestroy(stream), cudaSuccess); +} + +TEST_F(AOTITorchCUDAGuardTest, CreateCUDAStreamGuardNullStream) { + CUDAStreamGuardHandle guard = nullptr; + AOTITorchError error = + aoti_torch_create_cuda_stream_guard(nullptr, 0, &guard); + EXPECT_EQ(error, Error::InvalidArgument); +} + +TEST_F(AOTITorchCUDAGuardTest, DeleteCUDAStreamGuardNullHandle) { + AOTITorchError error = aoti_torch_delete_cuda_stream_guard(nullptr); + EXPECT_EQ(error, Error::InvalidArgument); +} + +TEST_F(AOTITorchCUDAGuardTest, GetCurrentCUDAStream) { + void* ret_stream = nullptr; + AOTITorchError error = aoti_torch_get_current_cuda_stream(0, &ret_stream); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(ret_stream, nullptr); +} + +TEST_F(AOTITorchCUDAGuardTest, GetCurrentCUDAStreamNullReturnPointer) { + AOTITorchError error = aoti_torch_get_current_cuda_stream(0, nullptr); + EXPECT_EQ(error, Error::InvalidArgument); +} + +TEST_F(AOTITorchCUDAGuardTest, StreamGuardWithSameDevice) { + ASSERT_EQ(cudaSetDevice(0), cudaSuccess); + + cudaStream_t stream1, stream2; + ASSERT_EQ(cudaStreamCreate(&stream1), cudaSuccess); + ASSERT_EQ(cudaStreamCreate(&stream2), cudaSuccess); + + CUDAStreamGuardHandle guard1 = nullptr; + AOTITorchError error = + aoti_torch_create_cuda_stream_guard(stream1, 0, &guard1); + 
EXPECT_EQ(error, Error::Ok); + + void* ret_stream = nullptr; + error = aoti_torch_get_current_cuda_stream(0, &ret_stream); + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(static_cast(ret_stream), stream1); + + CUDAStreamGuardHandle guard2 = nullptr; + error = aoti_torch_create_cuda_stream_guard(stream2, 0, &guard2); + EXPECT_EQ(error, Error::Ok); + + ret_stream = nullptr; + error = aoti_torch_get_current_cuda_stream(0, &ret_stream); + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(static_cast(ret_stream), stream2); + + error = aoti_torch_delete_cuda_stream_guard(guard2); + EXPECT_EQ(error, Error::Ok); + + ret_stream = nullptr; + error = aoti_torch_get_current_cuda_stream(0, &ret_stream); + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(static_cast(ret_stream), stream1); + + error = aoti_torch_delete_cuda_stream_guard(guard1); + EXPECT_EQ(error, Error::Ok); + + ASSERT_EQ(cudaStreamDestroy(stream1), cudaSuccess); + ASSERT_EQ(cudaStreamDestroy(stream2), cudaSuccess); +} + +TEST_F(AOTITorchCUDAGuardTest, GetCurrentStreamAfterSetStream) { + cudaStream_t new_stream; + ASSERT_EQ(cudaStreamCreate(&new_stream), cudaSuccess); + + CUDAStreamGuardHandle guard = nullptr; + AOTITorchError error = + aoti_torch_create_cuda_stream_guard(new_stream, 0, &guard); + EXPECT_EQ(error, Error::Ok); + + void* ret_stream = nullptr; + error = aoti_torch_get_current_cuda_stream(0, &ret_stream); + EXPECT_EQ(error, Error::Ok); + EXPECT_EQ(static_cast(ret_stream), new_stream); + + error = aoti_torch_delete_cuda_stream_guard(guard); + EXPECT_EQ(error, Error::Ok); + + ASSERT_EQ(cudaStreamDestroy(new_stream), cudaSuccess); +} diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object.cpp new file mode 100644 index 00000000000..10c8d8c1a31 --- /dev/null +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_delete_tensor_object.cpp @@ -0,0 +1,454 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace executorch::backends::aoti; +using namespace executorch::backends::cuda; +using namespace executorch::runtime; +using executorch::runtime::etensor::Tensor; + +// Test fixture for aoti_torch_delete_tensor_object tests +class AOTITorchDeleteTensorObjectTest : public ::testing::Test { + protected: + void SetUp() override { + // Initialize ExecuTorch Platform Abstraction Layer + et_pal_init(); + + // Check if CUDA is available + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + if (err != cudaSuccess || device_count == 0) { + GTEST_SKIP() << "CUDA not available, skipping CUDA tests"; + } + + // Clean up any existing cached metadata before each test + cleanup_tensor_metadata(); + + // Clear any remaining tensors from previous tests + clear_all_tensors(); + } + + void TearDown() override { + // Clean up metadata + cleanup_tensor_metadata(); + + // Clear the global tensor storage using the provided function + clear_all_tensors(); + } + + // Helper to create test tensors + Tensor* create_test_tensor( + const std::vector& sizes, + const std::vector& strides = {}, + int32_t dtype = 6, // float32 + int32_t device_type = 1, // CUDA + int32_t device_index = 0) { + Tensor* tensor; + + const int64_t* strides_ptr = strides.empty() ? nullptr : strides.data(); + + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + strides_ptr, + dtype, + device_type, + device_index, + &tensor); + + return (error == Error::Ok) ? 
tensor : nullptr; + } +}; + +// Test basic deletion of CUDA tensor +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteCudaTensorBasic) { + // Create a CUDA tensor + std::vector sizes = {2, 3}; + Tensor* tensor = create_test_tensor(sizes, {}, 6, 1, 0); // CUDA device + ASSERT_NE(tensor, nullptr); + + // Verify tensor properties before deletion + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 2); + EXPECT_EQ(tensor->size(1), 3); + + // Delete the tensor + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); +} + +// Test basic deletion of CPU tensor +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteCpuTensorBasic) { + // Create a CPU tensor + std::vector sizes = {3, 4}; + Tensor* tensor = create_test_tensor(sizes, {}, 6, 0, 0); // CPU device + ASSERT_NE(tensor, nullptr); + + // Verify tensor properties before deletion + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 3); + EXPECT_EQ(tensor->size(1), 4); + + // Delete the tensor + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); +} + +// Test deletion of null tensor pointer +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteNullTensor) { + AOTITorchError error = aoti_torch_delete_tensor_object(nullptr); + EXPECT_EQ(error, Error::InvalidArgument); +} + +// Test deletion of tensor not in tracking system +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteUntrackedTensor) { + // Create a tensor and then clear the tracking system + std::vector sizes = {2, 3}; + Tensor* tensor = create_test_tensor(sizes); + ASSERT_NE(tensor, nullptr); + + // Clear the tracking system (simulating an untracked tensor) + clear_all_tensors(); + + // Try to delete the tensor - should fail + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::InvalidArgument); +} + +// Test deletion of multiple tensors +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteMultipleTensors) { + // Create multiple tensors + std::vector 
tensors; + + for (int i = 1; i <= 5; i++) { + std::vector sizes = {i, i + 1}; + Tensor* tensor = create_test_tensor(sizes); + ASSERT_NE(tensor, nullptr); + tensors.push_back(tensor); + } + + // Delete all tensors + for (Tensor* tensor : tensors) { + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); + } +} + +// Test deletion of zero-sized tensors +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteZeroSizedTensor) { + // Create a zero-sized tensor + std::vector sizes = {0, 5}; + Tensor* tensor = create_test_tensor(sizes); + ASSERT_NE(tensor, nullptr); + + // Verify tensor properties + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 0); + EXPECT_EQ(tensor->size(1), 5); + + // Delete the tensor + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); +} + +// Test deletion of scalar (0D) tensors +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteScalarTensor) { + // Create a scalar tensor + std::vector sizes = {}; + Tensor* tensor = create_test_tensor(sizes); + ASSERT_NE(tensor, nullptr); + + // Verify tensor properties + EXPECT_EQ(tensor->dim(), 0); + + // Delete the tensor + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); +} + +// Test deletion of large multi-dimensional tensors +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteLargeTensor) { + // Create a large multi-dimensional tensor + std::vector sizes = {10, 20, 30}; + Tensor* tensor = create_test_tensor(sizes); + ASSERT_NE(tensor, nullptr); + + // Verify tensor properties + EXPECT_EQ(tensor->dim(), 3); + EXPECT_EQ(tensor->size(0), 10); + EXPECT_EQ(tensor->size(1), 20); + EXPECT_EQ(tensor->size(2), 30); + + // Delete the tensor + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); +} + +// Test deletion of tensors with custom strides +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteTensorWithCustomStrides) { + // Create tensor with custom 
strides + std::vector sizes = {3, 4}; + std::vector strides = {4, 1}; // Row-major strides + Tensor* tensor = create_test_tensor(sizes, strides); + ASSERT_NE(tensor, nullptr); + + // Verify tensor properties + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 3); + EXPECT_EQ(tensor->size(1), 4); + + // Delete the tensor + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); +} + +// Test deletion after accessing tensor data +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteAfterDataAccess) { + // Create a tensor + std::vector sizes = {2, 3}; + Tensor* tensor = create_test_tensor(sizes); + ASSERT_NE(tensor, nullptr); + + // Access tensor data (this should not prevent deletion) + void* data_ptr = tensor->mutable_data_ptr(); + EXPECT_NE(data_ptr, nullptr); + + // Delete the tensor + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); +} + +// Test double deletion (should fail on second attempt) +TEST_F(AOTITorchDeleteTensorObjectTest, DoubleDeletion) { + // Create a tensor + std::vector sizes = {2, 3}; + Tensor* tensor = create_test_tensor(sizes); + ASSERT_NE(tensor, nullptr); + + // First deletion should succeed + AOTITorchError error1 = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error1, Error::Ok); + + // Second deletion should fail (tensor no longer tracked) + AOTITorchError error2 = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error2, Error::InvalidArgument); +} + +// Test deletion of tensors on both CUDA and CPU devices +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteMixedDeviceTensors) { + // Create CUDA tensor + std::vector sizes = {2, 3}; + Tensor* cuda_tensor = create_test_tensor(sizes, {}, 6, 1, 0); + ASSERT_NE(cuda_tensor, nullptr); + + // Create CPU tensor + Tensor* cpu_tensor = create_test_tensor(sizes, {}, 6, 0, 0); + ASSERT_NE(cpu_tensor, nullptr); + + // Delete both tensors + AOTITorchError cuda_error = 
aoti_torch_delete_tensor_object(cuda_tensor); + EXPECT_EQ(cuda_error, Error::Ok); + + AOTITorchError cpu_error = aoti_torch_delete_tensor_object(cpu_tensor); + EXPECT_EQ(cpu_error, Error::Ok); +} + +// Test memory consistency after deletion +TEST_F(AOTITorchDeleteTensorObjectTest, MemoryConsistencyAfterDeletion) { + // Create multiple tensors + std::vector tensors; + const int num_tensors = 10; + + for (int i = 0; i < num_tensors; i++) { + std::vector sizes = {i + 1, i + 2}; + Tensor* tensor = create_test_tensor(sizes); + ASSERT_NE(tensor, nullptr); + tensors.push_back(tensor); + } + + // Delete every other tensor + for (int i = 0; i < num_tensors; i += 2) { + AOTITorchError error = aoti_torch_delete_tensor_object(tensors[i]); + EXPECT_EQ(error, Error::Ok); + } + + // Delete remaining tensors + for (int i = 1; i < num_tensors; i += 2) { + AOTITorchError error = aoti_torch_delete_tensor_object(tensors[i]); + EXPECT_EQ(error, Error::Ok); + } +} + +// Test stress deletion with many small tensors +TEST_F(AOTITorchDeleteTensorObjectTest, StressDeletionManySmallTensors) { + const int num_tensors = 100; + std::vector tensors; + + // Create many small tensors + for (int i = 0; i < num_tensors; i++) { + std::vector sizes = {1, 1}; // Minimal size + Tensor* tensor = create_test_tensor(sizes); + if (tensor != nullptr) { + tensors.push_back(tensor); + } + } + + // Delete all created tensors + for (Tensor* tensor : tensors) { + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); + EXPECT_EQ(error, Error::Ok); + } +} + +// Test CUDA synchronization during deletion +TEST_F(AOTITorchDeleteTensorObjectTest, CudaSynchronizationDuringDeletion) { + // Create a larger CUDA tensor to ensure memory allocation + std::vector sizes = {100, 100}; + Tensor* tensor = create_test_tensor(sizes, {}, 6, 1, 0); // CUDA device + ASSERT_NE(tensor, nullptr); + + // Delete the tensor (should handle synchronization internally) + AOTITorchError error = aoti_torch_delete_tensor_object(tensor); 
+ EXPECT_EQ(error, Error::Ok); + + // Verify CUDA state is still good + cudaError_t cuda_error = cudaGetLastError(); + EXPECT_EQ(cuda_error, cudaSuccess); +} + +// Test specific deletion of bfloat16 tensors +TEST_F(AOTITorchDeleteTensorObjectTest, DeleteBFloat16Tensor) { + // Test 1D bfloat16 tensor deletion + std::vector sizes_1d = {10}; + Tensor* tensor_bf16_1d = create_test_tensor( + sizes_1d, + {}, + static_cast(SupportedDTypes::BFLOAT16), + 1, // CUDA device + 0); + ASSERT_NE(tensor_bf16_1d, nullptr); + + // Verify it's bfloat16 before deletion + int32_t actual_dtype; + EXPECT_EQ(aoti_torch_get_dtype(tensor_bf16_1d, &actual_dtype), Error::Ok); + EXPECT_EQ(actual_dtype, static_cast(SupportedDTypes::BFLOAT16)) + << "Expected bfloat16 dtype (" + << static_cast(SupportedDTypes::BFLOAT16) << "), got " + << actual_dtype; + + // Verify element size (bfloat16 should be 2 bytes per element) + EXPECT_EQ(tensor_bf16_1d->element_size(), 2); + + // Delete the bfloat16 tensor + AOTITorchError error = aoti_torch_delete_tensor_object(tensor_bf16_1d); + EXPECT_EQ(error, Error::Ok); + + // Test 2D bfloat16 tensor deletion with custom strides + std::vector sizes_2d = {4, 6}; + std::vector strides_2d = {6, 1}; // Row-major strides + Tensor* tensor_bf16_2d = create_test_tensor( + sizes_2d, + strides_2d, + static_cast(SupportedDTypes::BFLOAT16), + 1, // CUDA device + 0); + ASSERT_NE(tensor_bf16_2d, nullptr); + + // Verify tensor properties + EXPECT_EQ(tensor_bf16_2d->dim(), 2); + EXPECT_EQ(tensor_bf16_2d->size(0), 4); + EXPECT_EQ(tensor_bf16_2d->size(1), 6); + EXPECT_EQ(tensor_bf16_2d->element_size(), 2); + + // Verify it's bfloat16 + int32_t dtype_2d; + EXPECT_EQ(aoti_torch_get_dtype(tensor_bf16_2d, &dtype_2d), Error::Ok); + EXPECT_EQ(dtype_2d, static_cast(SupportedDTypes::BFLOAT16)); + + // Delete the 2D bfloat16 tensor + error = aoti_torch_delete_tensor_object(tensor_bf16_2d); + EXPECT_EQ(error, Error::Ok); + + // Test 3D bfloat16 tensor deletion + std::vector sizes_3d = {2, 3, 
4}; + Tensor* tensor_bf16_3d = create_test_tensor( + sizes_3d, + {}, + static_cast(SupportedDTypes::BFLOAT16), + 1, // CUDA device + 0); + ASSERT_NE(tensor_bf16_3d, nullptr); + + // Verify tensor properties + EXPECT_EQ(tensor_bf16_3d->dim(), 3); + EXPECT_EQ(tensor_bf16_3d->size(0), 2); + EXPECT_EQ(tensor_bf16_3d->size(1), 3); + EXPECT_EQ(tensor_bf16_3d->size(2), 4); + EXPECT_EQ(tensor_bf16_3d->element_size(), 2); + + // Verify memory size (2 * 3 * 4 * 2 bytes = 48 bytes) + size_t expected_memory = 2 * 3 * 4 * 2; + size_t actual_memory = + tensor_bf16_3d->numel() * tensor_bf16_3d->element_size(); + EXPECT_EQ(actual_memory, expected_memory); + + // Delete the 3D bfloat16 tensor + error = aoti_torch_delete_tensor_object(tensor_bf16_3d); + EXPECT_EQ(error, Error::Ok); + + // Test bfloat16 scalar tensor (0D) deletion + std::vector scalar_sizes = {}; + Tensor* tensor_bf16_scalar = create_test_tensor( + scalar_sizes, + {}, + static_cast(SupportedDTypes::BFLOAT16), + 1, // CUDA device + 0); + ASSERT_NE(tensor_bf16_scalar, nullptr); + + // Verify scalar tensor properties + EXPECT_EQ(tensor_bf16_scalar->dim(), 0); + EXPECT_EQ(tensor_bf16_scalar->numel(), 1); + EXPECT_EQ(tensor_bf16_scalar->element_size(), 2); + + // Delete the scalar bfloat16 tensor + error = aoti_torch_delete_tensor_object(tensor_bf16_scalar); + EXPECT_EQ(error, Error::Ok); + + // Test zero-element bfloat16 tensor deletion + std::vector zero_sizes = {0, 5}; + Tensor* tensor_bf16_zero = create_test_tensor( + zero_sizes, + {}, + static_cast(SupportedDTypes::BFLOAT16), + 1, // CUDA device + 0); + ASSERT_NE(tensor_bf16_zero, nullptr); + + // Verify zero-element tensor properties + EXPECT_EQ(tensor_bf16_zero->dim(), 2); + EXPECT_EQ(tensor_bf16_zero->size(0), 0); + EXPECT_EQ(tensor_bf16_zero->size(1), 5); + EXPECT_EQ(tensor_bf16_zero->numel(), 0); + EXPECT_EQ(tensor_bf16_zero->element_size(), 2); + + // Delete the zero-element bfloat16 tensor + error = aoti_torch_delete_tensor_object(tensor_bf16_zero); + 
EXPECT_EQ(error, Error::Ok); +} + +// Test deletion of mixed dtype tensors (float32 and bfloat16) diff --git a/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp b/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp new file mode 100644 index 00000000000..da65129f18a --- /dev/null +++ b/backends/cuda/runtime/shims/tests/test_aoti_torch_empty_strided.cpp @@ -0,0 +1,588 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace executorch::backends::cuda; +using namespace executorch::backends::aoti; +using namespace executorch::runtime; +using executorch::runtime::etensor::Tensor; + +// Test fixture for aoti_torch_empty_strided tests +class AOTITorchEmptyStridedTest : public ::testing::Test { + protected: + void SetUp() override { + // Initialize ExecuTorch Platform Abstraction Layer + et_pal_init(); + + // Check if CUDA is available + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + if (err != cudaSuccess || device_count == 0) { + GTEST_SKIP() << "CUDA not available, skipping CUDA tests"; + } + + // Clean up any existing cached metadata before each test + cleanup_tensor_metadata(); + + // Clear any remaining tensors from previous tests + clear_all_tensors(); + } + + void TearDown() override { + // Clean up metadata + cleanup_tensor_metadata(); + + // Clear the global tensor storage using the provided function + clear_all_tensors(); + } + + // Helper to create test tensors + Tensor* create_tracked_tensor( + const std::vector& sizes, + const std::vector& strides = {}, + int32_t dtype = static_cast(SupportedDTypes::FLOAT32), + int32_t device_type = static_cast(SupportedDevices::CUDA), + int32_t device_index = 0) { + Tensor* 
tensor; + + const int64_t* strides_ptr = strides.empty() ? nullptr : strides.data(); + + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + strides_ptr, + dtype, + device_type, + device_index, + &tensor); + + return (error == Error::Ok) ? tensor : nullptr; + } +}; + +// Test aoti_torch_empty_strided basic functionality +TEST_F(AOTITorchEmptyStridedTest, BasicFunctionality) { + // Test 1D tensor + std::vector sizes_1d = {5}; + Tensor* tensor_1d; + AOTITorchError error = aoti_torch_empty_strided( + sizes_1d.size(), + sizes_1d.data(), + nullptr, // Let function compute strides + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor_1d); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor_1d, nullptr); + + // CRITICAL: Verify the tensor is actually float32 + int32_t actual_dtype; + EXPECT_EQ(aoti_torch_get_dtype(tensor_1d, &actual_dtype), Error::Ok); + EXPECT_EQ(actual_dtype, static_cast(SupportedDTypes::FLOAT32)) + << "Expected float32 dtype (" + << static_cast(SupportedDTypes::FLOAT32) << "), got " + << actual_dtype; + + // Verify element size (float32 should be 4 bytes per element) + size_t element_size = tensor_1d->element_size(); + EXPECT_EQ(element_size, 4) + << "Expected float32 element size to be 4 bytes, got " << element_size; + + // Verify total number of elements and memory usage + int64_t expected_numel = 5; // 5 elements + EXPECT_EQ(tensor_1d->numel(), expected_numel) + << "Expected " << expected_numel << " elements, got " + << tensor_1d->numel(); + + // Verify total memory size (numel * element_size) + size_t expected_memory_size = expected_numel * 4; // 5 * 4 = 20 bytes + size_t actual_memory_size = tensor_1d->numel() * tensor_1d->element_size(); + EXPECT_EQ(actual_memory_size, expected_memory_size) + << "Expected " << expected_memory_size << " bytes, got " + << actual_memory_size; + + // Check tensor properties + EXPECT_EQ(tensor_1d->dim(), 1); + 
EXPECT_EQ(tensor_1d->size(0), 5); + + // Test 2D tensor with explicit strides + std::vector sizes_2d = {3, 4}; + std::vector strides_2d = {4, 1}; + Tensor* tensor_2d; + error = aoti_torch_empty_strided( + sizes_2d.size(), + sizes_2d.data(), + strides_2d.data(), + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor_2d); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor_2d, nullptr); + + // Verify 2D tensor is also float32 + int32_t dtype_2d; + EXPECT_EQ(aoti_torch_get_dtype(tensor_2d, &dtype_2d), Error::Ok); + EXPECT_EQ(dtype_2d, static_cast(SupportedDTypes::FLOAT32)) + << "Expected float32 dtype (" + << static_cast(SupportedDTypes::FLOAT32) << "), got " + << dtype_2d; + + // Verify element size for 2D tensor + EXPECT_EQ(tensor_2d->element_size(), 4); + + // Check tensor properties + EXPECT_EQ(tensor_2d->dim(), 2); + EXPECT_EQ(tensor_2d->size(0), 3); + EXPECT_EQ(tensor_2d->size(1), 4); + + // Verify memory size for 2D tensor + int64_t expected_numel_2d = 3 * 4; // 12 elements + size_t expected_memory_2d = expected_numel_2d * 4; // 12 * 4 = 48 bytes + EXPECT_EQ(tensor_2d->numel() * tensor_2d->element_size(), expected_memory_2d); +} + +// Test aoti_torch_empty_strided with CPU device +TEST_F(AOTITorchEmptyStridedTest, CPUDevice) { + std::vector sizes = {2, 3}; + Tensor* tensor; + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + nullptr, // Let function compute strides + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDevices::CPU), + 0, // device index + &tensor); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, nullptr); + + // Check tensor properties + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 2); + EXPECT_EQ(tensor->size(1), 3); +} + +// Test aoti_torch_empty_strided with invalid dtype +TEST_F(AOTITorchEmptyStridedTest, InvalidDtype) { + std::vector sizes = {2, 3}; + Tensor* tensor; + AOTITorchError error = aoti_torch_empty_strided( + 
sizes.size(), + sizes.data(), + nullptr, + 999, // invalid dtype + 1, // CUDA device + 0, // device index + &tensor); + + EXPECT_EQ(error, Error::InvalidArgument); +} + +// Test aoti_torch_empty_strided with unsupported device +TEST_F(AOTITorchEmptyStridedTest, UnsupportedDevice) { + std::vector sizes = {2, 3}; + Tensor* tensor; + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + nullptr, + 6, // float32 + 2, // unsupported device type + 0, // device index + &tensor); + + EXPECT_EQ(error, Error::NotImplemented); +} + +// Test aoti_torch_empty_strided with zero-sized tensor +TEST_F(AOTITorchEmptyStridedTest, ZeroSized) { + std::vector sizes = {0, 5}; + Tensor* tensor; + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + nullptr, + 6, // float32 + 1, // CUDA device + 0, // device index + &tensor); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, nullptr); + + // Check tensor properties + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 0); + EXPECT_EQ(tensor->size(1), 5); +} + +// Test aoti_torch_empty_strided scalar tensor (0D) +TEST_F(AOTITorchEmptyStridedTest, Scalar) { + std::vector sizes = {}; + Tensor* tensor; + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + nullptr, + 6, // float32 + 1, // CUDA device + 0, // device index + &tensor); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, nullptr); + + // Check tensor properties + EXPECT_EQ(tensor->dim(), 0); +} + +// Test aoti_torch_empty_strided with large tensor +TEST_F(AOTITorchEmptyStridedTest, LargeTensor) { + std::vector sizes = {100, 200, 50}; + Tensor* tensor; + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + nullptr, + 6, // float32 + 1, // CUDA device + 0, // device index + &tensor); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor, nullptr); + + // Check tensor properties + EXPECT_EQ(tensor->dim(), 3); + EXPECT_EQ(tensor->size(0), 100); + 
EXPECT_EQ(tensor->size(1), 200); + EXPECT_EQ(tensor->size(2), 50); +} + +// Test error handling with memory allocation failures +TEST_F(AOTITorchEmptyStridedTest, MemoryAllocationStress) { + // Try to create a very large tensor that might cause allocation failure + // (This test may pass or fail depending on available memory) + std::vector huge_sizes = {10000, 10000, 100}; // ~38GB for float32 + Tensor* tensor; + + AOTITorchError error = aoti_torch_empty_strided( + huge_sizes.size(), + huge_sizes.data(), + nullptr, + 6, // float32 + 1, // CUDA device + 0, // device index + &tensor); + + // Either succeed or fail with memory allocation error + if (error == Error::Ok) { + EXPECT_NE(tensor, nullptr); + } else { + EXPECT_EQ(error, Error::MemoryAllocationFailed); + } +} + +// Test aoti_torch_empty_strided with bfloat16 dtype +TEST_F(AOTITorchEmptyStridedTest, BFloat16Tensor) { + // Test creating bfloat16 tensor on CUDA + std::vector sizes = {2, 3, 4}; + Tensor* tensor_bf16; + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + nullptr, // Let function compute strides + static_cast(SupportedDTypes::BFLOAT16), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor_bf16); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor_bf16, nullptr); + + // CRITICAL: Verify the tensor is actually bfloat16 + int32_t actual_dtype; + EXPECT_EQ(aoti_torch_get_dtype(tensor_bf16, &actual_dtype), Error::Ok); + EXPECT_EQ(actual_dtype, static_cast(SupportedDTypes::BFLOAT16)) + << "Expected bfloat16 dtype (" + << static_cast(SupportedDTypes::BFLOAT16) << "), got " + << actual_dtype; + + // Verify element size (bfloat16 should be 2 bytes per element) + size_t element_size = tensor_bf16->element_size(); + EXPECT_EQ(element_size, 2) + << "Expected bfloat16 element size to be 2 bytes, got " << element_size; + + // Verify total number of elements and memory usage + int64_t expected_numel = 2 * 3 * 4; // 24 elements + EXPECT_EQ(tensor_bf16->numel(), 
expected_numel) + << "Expected " << expected_numel << " elements, got " + << tensor_bf16->numel(); + + // Verify total memory size (numel * element_size) + size_t expected_memory_size = expected_numel * 2; // 24 * 2 = 48 bytes + size_t actual_memory_size = + tensor_bf16->numel() * tensor_bf16->element_size(); + EXPECT_EQ(actual_memory_size, expected_memory_size) + << "Expected " << expected_memory_size << " bytes, got " + << actual_memory_size; + + // Check tensor properties + EXPECT_EQ(tensor_bf16->dim(), 3); + EXPECT_EQ(tensor_bf16->size(0), 2); + EXPECT_EQ(tensor_bf16->size(1), 3); + EXPECT_EQ(tensor_bf16->size(2), 4); + + // Verify we can get tensor metadata + int64_t* sizes_ptr; + int64_t* strides_ptr; + EXPECT_EQ(aoti_torch_get_sizes(tensor_bf16, &sizes_ptr), Error::Ok); + EXPECT_EQ(aoti_torch_get_strides(tensor_bf16, &strides_ptr), Error::Ok); + + // Check sizes match + EXPECT_EQ(sizes_ptr[0], 2); + EXPECT_EQ(sizes_ptr[1], 3); + EXPECT_EQ(sizes_ptr[2], 4); + + // Check that strides are computed correctly (row-major order) + EXPECT_EQ(strides_ptr[0], 12); // 3 * 4 + EXPECT_EQ(strides_ptr[1], 4); // 4 + EXPECT_EQ(strides_ptr[2], 1); // 1 + + // Test bfloat16 tensor with custom strides + std::vector sizes_2d = {3, 2}; + std::vector strides_2d = {2, 1}; // Row-major strides + Tensor* tensor_bf16_custom; + error = aoti_torch_empty_strided( + sizes_2d.size(), + sizes_2d.data(), + strides_2d.data(), + static_cast(SupportedDTypes::BFLOAT16), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor_bf16_custom); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor_bf16_custom, nullptr); + + // Verify custom stride tensor is also bfloat16 + int32_t custom_dtype; + EXPECT_EQ(aoti_torch_get_dtype(tensor_bf16_custom, &custom_dtype), Error::Ok); + EXPECT_EQ(custom_dtype, static_cast(SupportedDTypes::BFLOAT16)) + << "Expected bfloat16 dtype (" + << static_cast(SupportedDTypes::BFLOAT16) << "), got " + << custom_dtype; + + // Verify element size for custom 
stride tensor + EXPECT_EQ(tensor_bf16_custom->element_size(), 2); + + // Check tensor properties + EXPECT_EQ(tensor_bf16_custom->dim(), 2); + EXPECT_EQ(tensor_bf16_custom->size(0), 3); + EXPECT_EQ(tensor_bf16_custom->size(1), 2); + + // Verify memory size for custom stride tensor + int64_t custom_expected_numel = 3 * 2; // 6 elements + size_t custom_expected_memory = custom_expected_numel * 2; // 6 * 2 = 12 bytes + EXPECT_EQ( + tensor_bf16_custom->numel() * tensor_bf16_custom->element_size(), + custom_expected_memory); + + // Check custom strides + int64_t* custom_strides_ptr; + EXPECT_EQ( + aoti_torch_get_strides(tensor_bf16_custom, &custom_strides_ptr), + Error::Ok); + EXPECT_EQ(custom_strides_ptr[0], 2); + EXPECT_EQ(custom_strides_ptr[1], 1); + + // Test bfloat16 scalar tensor (0D) + std::vector scalar_sizes = {}; + Tensor* tensor_bf16_scalar; + error = aoti_torch_empty_strided( + scalar_sizes.size(), + scalar_sizes.data(), + nullptr, + static_cast(SupportedDTypes::BFLOAT16), + static_cast(SupportedDevices::CUDA), + 0, // device index + &tensor_bf16_scalar); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor_bf16_scalar, nullptr); + EXPECT_EQ(tensor_bf16_scalar->dim(), 0); + + // Verify scalar tensor is also bfloat16 + int32_t scalar_dtype; + EXPECT_EQ(aoti_torch_get_dtype(tensor_bf16_scalar, &scalar_dtype), Error::Ok); + EXPECT_EQ(scalar_dtype, static_cast(SupportedDTypes::BFLOAT16)) + << "Expected bfloat16 dtype (" + << static_cast(SupportedDTypes::BFLOAT16) << "), got " + << scalar_dtype; + + // Verify scalar tensor properties + EXPECT_EQ(tensor_bf16_scalar->element_size(), 2); + EXPECT_EQ(tensor_bf16_scalar->numel(), 1); // Scalar tensor has 1 element + EXPECT_EQ( + tensor_bf16_scalar->numel() * tensor_bf16_scalar->element_size(), + 2); // 1 * 2 = 2 bytes +} + +// Test custom strides functionality +TEST_F(AOTITorchEmptyStridedTest, CustomStrides) { + // Create tensor with valid custom strides (contiguous layout) + std::vector sizes = {2, 3}; + std::vector 
strides = {3, 1}; // Standard row-major strides + + Tensor* tensor = create_tracked_tensor(sizes, strides); + EXPECT_NE(tensor, nullptr); + + // Verify the tensor was created correctly + EXPECT_EQ(tensor->dim(), 2); + EXPECT_EQ(tensor->size(0), 2); + EXPECT_EQ(tensor->size(1), 3); + + // Check strides through AOTI interface + int64_t* strides_ptr; + EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr), Error::Ok); + EXPECT_EQ(strides_ptr[0], 3); + EXPECT_EQ(strides_ptr[1], 1); + + // Test another valid stride pattern - transpose-like + std::vector sizes_2 = {3, 2}; + std::vector strides_2 = {1, 3}; // Column-major strides + + Tensor* tensor_2 = create_tracked_tensor(sizes_2, strides_2); + EXPECT_NE(tensor_2, nullptr); + + // Verify the tensor properties + EXPECT_EQ(tensor_2->dim(), 2); + EXPECT_EQ(tensor_2->size(0), 3); + EXPECT_EQ(tensor_2->size(1), 2); + + // Check strides + int64_t* strides_ptr_2; + EXPECT_EQ(aoti_torch_get_strides(tensor_2, &strides_ptr_2), Error::Ok); + EXPECT_EQ(strides_ptr_2[0], 1); + EXPECT_EQ(strides_ptr_2[1], 3); +} + +// Test edge case: zero-element tensor with non-zero dimensions +TEST_F(AOTITorchEmptyStridedTest, ZeroElementTensor) { + std::vector sizes = {2, 0, 3}; // Total elements = 0 + Tensor* tensor = create_tracked_tensor(sizes); + EXPECT_NE(tensor, nullptr); + + // Verify the tensor properties + EXPECT_EQ(tensor->dim(), 3); + EXPECT_EQ(tensor->size(0), 2); + EXPECT_EQ(tensor->size(1), 0); + EXPECT_EQ(tensor->size(2), 3); + + // Should be able to get metadata + int64_t* sizes_ptr; + int64_t* strides_ptr; + EXPECT_EQ(aoti_torch_get_sizes(tensor, &sizes_ptr), Error::Ok); + EXPECT_EQ(aoti_torch_get_strides(tensor, &strides_ptr), Error::Ok); + + EXPECT_EQ(sizes_ptr[0], 2); + EXPECT_EQ(sizes_ptr[1], 0); + EXPECT_EQ(sizes_ptr[2], 3); +} + +// Test different data types (only float32 is currently supported) +TEST_F(AOTITorchEmptyStridedTest, DifferentDataTypes) { + std::vector sizes = {2, 3}; + + // Test float32 (dtype 6) - currently 
the only supported type + Tensor* tensor_float32; + AOTITorchError error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + nullptr, + 6, // float32 + 1, // CUDA device + 0, // device index + &tensor_float32); + + EXPECT_EQ(error, Error::Ok); + EXPECT_NE(tensor_float32, nullptr); + + // Test unsupported data types should return error + Tensor* tensor_int32; + error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + nullptr, + 3, // int32 - unsupported + 1, // CUDA device + 0, // device index + &tensor_int32); + + EXPECT_EQ(error, Error::InvalidArgument); // Should fail for unsupported dtype + + // Test another unsupported data type + Tensor* tensor_float64; + error = aoti_torch_empty_strided( + sizes.size(), + sizes.data(), + nullptr, + 7, // float64 - unsupported + 1, // CUDA device + 0, // device index + &tensor_float64); + + EXPECT_EQ(error, Error::InvalidArgument); // Should fail for unsupported dtype +} + +// Test multi-dimensional tensors with various shapes +TEST_F(AOTITorchEmptyStridedTest, MultiDimensionalTensors) { + // Test 3D tensor + std::vector sizes_3d = {2, 3, 4}; + Tensor* tensor_3d = create_tracked_tensor(sizes_3d); + EXPECT_NE(tensor_3d, nullptr); + EXPECT_EQ(tensor_3d->dim(), 3); + EXPECT_EQ(tensor_3d->size(0), 2); + EXPECT_EQ(tensor_3d->size(1), 3); + EXPECT_EQ(tensor_3d->size(2), 4); + + // Test 4D tensor + std::vector sizes_4d = {2, 3, 4, 5}; + Tensor* tensor_4d = create_tracked_tensor(sizes_4d); + EXPECT_NE(tensor_4d, nullptr); + EXPECT_EQ(tensor_4d->dim(), 4); + EXPECT_EQ(tensor_4d->size(0), 2); + EXPECT_EQ(tensor_4d->size(1), 3); + EXPECT_EQ(tensor_4d->size(2), 4); + EXPECT_EQ(tensor_4d->size(3), 5); + + // Test 5D tensor + std::vector sizes_5d = {1, 2, 3, 4, 5}; + Tensor* tensor_5d = create_tracked_tensor(sizes_5d); + EXPECT_NE(tensor_5d, nullptr); + EXPECT_EQ(tensor_5d->dim(), 5); + EXPECT_EQ(tensor_5d->size(0), 1); + EXPECT_EQ(tensor_5d->size(1), 2); + EXPECT_EQ(tensor_5d->size(2), 3); + 
EXPECT_EQ(tensor_5d->size(3), 4); + EXPECT_EQ(tensor_5d->size(4), 5); +} diff --git a/backends/cuda/runtime/tests/TARGETS b/backends/cuda/runtime/tests/TARGETS new file mode 100644 index 00000000000..9ff3e83a8bd --- /dev/null +++ b/backends/cuda/runtime/tests/TARGETS @@ -0,0 +1,6 @@ +load("@fbcode_macros//build_defs:cpp_unittest.bzl", "cpp_unittest") +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/backends/cuda/runtime/tests/targets.bzl b/backends/cuda/runtime/tests/targets.bzl new file mode 100644 index 00000000000..37e8d876526 --- /dev/null +++ b/backends/cuda/runtime/tests/targets.bzl @@ -0,0 +1,27 @@ +load("@fbcode_macros//build_defs:cpp_unittest.bzl", "cpp_unittest") + +def cuda_runtime_cpp_unittest(name): + cpp_unittest( + name = "test_" + name, + srcs = [ + "test_" + name + ".cpp", + ], + deps = [ + "//executorch/backends/cuda/runtime:runtime_shims", + "//executorch/runtime/core:core", + "//executorch/runtime/core/exec_aten:lib", + "//executorch/runtime/platform:platform", + ], + external_deps = [ + ("cuda", None, "cuda-lazy"), + ], + ) + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + cuda_runtime_cpp_unittest("cuda_guard") + cuda_runtime_cpp_unittest("cuda_stream_guard") diff --git a/backends/cuda/runtime/tests/test_cuda_guard.cpp b/backends/cuda/runtime/tests/test_cuda_guard.cpp new file mode 100644 index 00000000000..a364ae98484 --- /dev/null +++ b/backends/cuda/runtime/tests/test_cuda_guard.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include + +using namespace executorch::backends::cuda; +using namespace executorch::runtime; + +// TODO(gasoonjia): Multiple device tests were not included due to test +// environment limitations. These tests should be added in the future when +// multi-GPU test environments are available, + +class CUDAGuardTest : public ::testing::Test { + protected: + void SetUp() override { + et_pal_init(); + + int device_count = 0; + cudaError_t error = cudaGetDeviceCount(&device_count); + if (error != cudaSuccess || device_count == 0) { + GTEST_SKIP() << "CUDA not available or no CUDA devices found"; + } + device_count_ = device_count; + + ASSERT_EQ(cudaGetDevice(&original_device_), cudaSuccess); + } + + void TearDown() override { + if (device_count_ > 0) { + ASSERT_EQ(cudaSetDevice(original_device_), cudaSuccess); + } + } + + int device_count_ = 0; + int original_device_ = 0; +}; + +TEST_F(CUDAGuardTest, BasicDeviceSwitching) { + int current_device; + ASSERT_EQ(cudaGetDevice(¤t_device), cudaSuccess); + + { + auto guard_result = CUDAGuard::create(0); + ASSERT_TRUE(guard_result.ok()); + CUDAGuard guard = std::move(guard_result.get()); + + int device_after_guard; + ASSERT_EQ(cudaGetDevice(&device_after_guard), cudaSuccess); + EXPECT_EQ(device_after_guard, 0); + EXPECT_EQ(guard.current_device(), 0); + EXPECT_EQ(guard.original_device(), current_device); + } + + int device_after_destruction; + ASSERT_EQ(cudaGetDevice(&device_after_destruction), cudaSuccess); + EXPECT_EQ(device_after_destruction, current_device); +} + +TEST_F(CUDAGuardTest, SameDeviceNoSwitching) { + ASSERT_EQ(cudaSetDevice(0), cudaSuccess); + + { + auto guard_result = CUDAGuard::create(0); + ASSERT_TRUE(guard_result.ok()); + CUDAGuard guard = std::move(guard_result.get()); + + int current_device; + ASSERT_EQ(cudaGetDevice(¤t_device), cudaSuccess); + EXPECT_EQ(current_device, 0); + EXPECT_EQ(guard.current_device(), 0); + EXPECT_EQ(guard.original_device(), 0); + } + + int 
final_device; + ASSERT_EQ(cudaGetDevice(&final_device), cudaSuccess); + EXPECT_EQ(final_device, 0); +} + +TEST_F(CUDAGuardTest, InvalidDeviceIndex) { + auto guard_result = CUDAGuard::create(999); + EXPECT_FALSE(guard_result.ok()); +} + +TEST_F(CUDAGuardTest, NegativeDeviceIndex) { + auto guard_result = CUDAGuard::create(-2); + EXPECT_FALSE(guard_result.ok()); +} + +TEST_F(CUDAGuardTest, CopyConstructorDeleted) { + static_assert( + !std::is_copy_constructible_v, + "CUDAGuard should not be copy constructible"); +} + +TEST_F(CUDAGuardTest, CopyAssignmentDeleted) { + static_assert( + !std::is_copy_assignable_v, + "CUDAGuard should not be copy assignable"); +} + +TEST_F(CUDAGuardTest, MoveAssignmentDeleted) { + static_assert( + !std::is_move_assignable_v, + "CUDAGuard should not be move assignable"); +} diff --git a/backends/cuda/runtime/tests/test_cuda_stream_guard.cpp b/backends/cuda/runtime/tests/test_cuda_stream_guard.cpp new file mode 100644 index 00000000000..68a050a69be --- /dev/null +++ b/backends/cuda/runtime/tests/test_cuda_stream_guard.cpp @@ -0,0 +1,264 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +using namespace executorch::backends::cuda; +using namespace executorch::runtime; + +// TODO(gasoonjia): Multiple device tests were not included due to test +// environment limitations. 
These tests should be added in the future when +// multi-GPU test environments are available, + +class CUDAStreamGuardTest : public ::testing::Test { + protected: + void SetUp() override { + et_pal_init(); + + int device_count = 0; + cudaError_t error = cudaGetDeviceCount(&device_count); + if (error != cudaSuccess || device_count == 0) { + GTEST_SKIP() << "CUDA not available or no CUDA devices found"; + } + device_count_ = device_count; + + ASSERT_EQ(cudaGetDevice(&original_device_), cudaSuccess); + + ASSERT_EQ(cudaStreamCreate(&test_stream1_), cudaSuccess); + ASSERT_EQ(cudaStreamCreate(&test_stream2_), cudaSuccess); + } + + void TearDown() override { + if (test_stream1_) { + ASSERT_EQ(cudaStreamDestroy(test_stream1_), cudaSuccess); + } + if (test_stream2_) { + ASSERT_EQ(cudaStreamDestroy(test_stream2_), cudaSuccess); + } + + if (device_count_ > 0) { + ASSERT_EQ(cudaSetDevice(original_device_), cudaSuccess); + } + } + + int device_count_ = 0; + int original_device_ = 0; + cudaStream_t test_stream1_ = nullptr; + cudaStream_t test_stream2_ = nullptr; +}; + +TEST_F(CUDAStreamGuardTest, BasicStreamSwitching) { + auto guard_result = CUDAStreamGuard::create(test_stream1_, 0); + ASSERT_TRUE(guard_result.ok()); + CUDAStreamGuard guard = std::move(guard_result.get()); + + EXPECT_EQ(guard.stream(), test_stream1_); + EXPECT_EQ(guard.device_index(), 0); + + auto current_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(current_stream_result.ok()); + EXPECT_EQ(current_stream_result.get(), test_stream1_); + + int current_device; + ASSERT_EQ(cudaGetDevice(¤t_device), cudaSuccess); + EXPECT_EQ(current_device, 0); +} + +TEST_F(CUDAStreamGuardTest, StreamSwitchingOnSameDevice) { + Error err = setCurrentCUDAStream(test_stream1_, 0); + ASSERT_EQ(err, Error::Ok); + + auto current_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(current_stream_result.ok()); + EXPECT_EQ(current_stream_result.get(), test_stream1_); + + { + auto guard_result = 
CUDAStreamGuard::create(test_stream2_, 0); + ASSERT_TRUE(guard_result.ok()); + CUDAStreamGuard guard = std::move(guard_result.get()); + + auto new_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(new_stream_result.ok()); + EXPECT_EQ(new_stream_result.get(), test_stream2_); + EXPECT_EQ(guard.stream(), test_stream2_); + } + + auto restored_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(restored_stream_result.ok()); + EXPECT_EQ(restored_stream_result.get(), test_stream1_); +} + +TEST_F(CUDAStreamGuardTest, NestedStreamGuards) { + cudaStream_t initial_stream; + ASSERT_EQ(cudaStreamCreate(&initial_stream), cudaSuccess); + + Error err = setCurrentCUDAStream(initial_stream, 0); + ASSERT_EQ(err, Error::Ok); + + { + auto guard1_result = CUDAStreamGuard::create(test_stream1_, 0); + ASSERT_TRUE(guard1_result.ok()); + CUDAStreamGuard guard1 = std::move(guard1_result.get()); + + auto stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(stream_result.ok()); + EXPECT_EQ(stream_result.get(), test_stream1_); + + { + auto guard2_result = CUDAStreamGuard::create(test_stream2_, 0); + ASSERT_TRUE(guard2_result.ok()); + CUDAStreamGuard guard2 = std::move(guard2_result.get()); + + auto stream_result2 = getCurrentCUDAStream(0); + ASSERT_TRUE(stream_result2.ok()); + EXPECT_EQ(stream_result2.get(), test_stream2_); + } + + auto stream_result3 = getCurrentCUDAStream(0); + ASSERT_TRUE(stream_result3.ok()); + EXPECT_EQ(stream_result3.get(), test_stream1_); + } + + auto final_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(final_stream_result.ok()); + EXPECT_EQ(final_stream_result.get(), initial_stream); + + ASSERT_EQ(cudaStreamDestroy(initial_stream), cudaSuccess); +} + +TEST_F(CUDAStreamGuardTest, SameStreamNoChange) { + Error err = setCurrentCUDAStream(test_stream1_, 0); + ASSERT_EQ(err, Error::Ok); + + { + auto guard_result = CUDAStreamGuard::create(test_stream1_, 0); + ASSERT_TRUE(guard_result.ok()); + CUDAStreamGuard guard = std::move(guard_result.get()); + + auto 
stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(stream_result.ok()); + EXPECT_EQ(stream_result.get(), test_stream1_); + EXPECT_EQ(guard.stream(), test_stream1_); + } + + auto final_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(final_stream_result.ok()); + EXPECT_EQ(final_stream_result.get(), test_stream1_); +} + +TEST_F(CUDAStreamGuardTest, StreamAccessor) { + auto guard_result = CUDAStreamGuard::create(test_stream1_, 0); + ASSERT_TRUE(guard_result.ok()); + CUDAStreamGuard guard = std::move(guard_result.get()); + + EXPECT_EQ(guard.stream(), test_stream1_); + EXPECT_EQ(guard.device_index(), 0); +} + +TEST_F(CUDAStreamGuardTest, SetStreamMethod) { + auto guard_result = CUDAStreamGuard::create(test_stream1_, 0); + ASSERT_TRUE(guard_result.ok()); + CUDAStreamGuard guard = std::move(guard_result.get()); + + EXPECT_EQ(guard.stream(), test_stream1_); + + Error err = guard.set_stream(test_stream2_, 0); + EXPECT_EQ(err, Error::Ok); + + EXPECT_EQ(guard.stream(), test_stream2_); + + auto current_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(current_stream_result.ok()); + EXPECT_EQ(current_stream_result.get(), test_stream2_); +} + +TEST_F(CUDAStreamGuardTest, MoveConstructor) { + auto guard1_result = CUDAStreamGuard::create(test_stream1_, 0); + ASSERT_TRUE(guard1_result.ok()); + CUDAStreamGuard guard1 = std::move(guard1_result.get()); + + EXPECT_EQ(guard1.stream(), test_stream1_); + EXPECT_EQ(guard1.device_index(), 0); + + CUDAStreamGuard guard2 = std::move(guard1); + + EXPECT_EQ(guard2.stream(), test_stream1_); + EXPECT_EQ(guard2.device_index(), 0); + + auto current_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(current_stream_result.ok()); + EXPECT_EQ(current_stream_result.get(), test_stream1_); +} + +TEST_F(CUDAStreamGuardTest, MoveConstructorRestoresOnlyOnce) { + cudaStream_t initial_stream; + ASSERT_EQ(cudaStreamCreate(&initial_stream), cudaSuccess); + + Error err = setCurrentCUDAStream(initial_stream, 0); + ASSERT_EQ(err, Error::Ok); + + 
{ + auto guard1_result = CUDAStreamGuard::create(test_stream1_, 0); + ASSERT_TRUE(guard1_result.ok()); + CUDAStreamGuard guard1 = std::move(guard1_result.get()); + + { CUDAStreamGuard guard2 = std::move(guard1); } + + auto stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(stream_result.ok()); + EXPECT_EQ(stream_result.get(), initial_stream); + } + + auto final_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(final_stream_result.ok()); + EXPECT_EQ(final_stream_result.get(), initial_stream); + + ASSERT_EQ(cudaStreamDestroy(initial_stream), cudaSuccess); +} + +TEST_F(CUDAStreamGuardTest, InvalidDeviceIndex) { + auto guard_result = CUDAStreamGuard::create(test_stream1_, 999); + EXPECT_FALSE(guard_result.ok()); +} + +TEST_F(CUDAStreamGuardTest, NegativeDeviceIndex) { + auto guard_result = CUDAStreamGuard::create(test_stream1_, -2); + EXPECT_FALSE(guard_result.ok()); +} + +TEST_F(CUDAStreamGuardTest, CopyConstructorDeleted) { + static_assert( + !std::is_copy_constructible_v, + "CUDAStreamGuard should not be copy constructible"); +} + +TEST_F(CUDAStreamGuardTest, CopyAssignmentDeleted) { + static_assert( + !std::is_copy_assignable_v, + "CUDAStreamGuard should not be copy assignable"); +} + +TEST_F(CUDAStreamGuardTest, MoveAssignmentDeleted) { + static_assert( + !std::is_move_assignable_v, + "CUDAStreamGuard should not be move assignable"); +} + +TEST_F(CUDAStreamGuardTest, NullStreamPointer) { + auto guard_result = CUDAStreamGuard::create(nullptr, 0); + ASSERT_TRUE(guard_result.ok()); + CUDAStreamGuard guard = std::move(guard_result.get()); + + EXPECT_EQ(guard.stream(), nullptr); + + auto current_stream_result = getCurrentCUDAStream(0); + ASSERT_TRUE(current_stream_result.ok()); +} diff --git a/backends/cuda/runtime/utils.h b/backends/cuda/runtime/utils.h new file mode 100644 index 00000000000..2d805724090 --- /dev/null +++ b/backends/cuda/runtime/utils.h @@ -0,0 +1,126 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include + +// CUDA error checking macro +#define ET_CUDA_CHECK_OR_RETURN_ERROR(EXPR) \ + do { \ + const cudaError_t err = EXPR; \ + if (err == cudaSuccess) { \ + break; \ + } \ + ET_LOG( \ + Error, \ + "%s:%d CUDA error: %s", \ + __FILE__, \ + __LINE__, \ + cudaGetErrorString(err)); \ + return Error::Internal; \ + } while (0) + +// Kernel launch check macro +#define ET_CUDA_KERNEL_LAUNCH_CHECK_OR_RETURN_ERROR() \ + ET_CUDA_CHECK_OR_RETURN_ERROR(cudaGetLastError()) + +namespace executorch::backends::cuda { + +// Enum for supported data types in et-cuda backend +enum class SupportedDTypes : int32_t { + INT64 = 4, // PyTorch's int64 dtype code + FLOAT32 = 6, // PyTorch's float32 dtype code + BFLOAT16 = 15, // PyTorch's bfloat16 dtype code +}; + +// Enum for supported device types in et-cuda backend +enum class SupportedDevices : int32_t { + CPU = 0, // CPU device + CUDA = 1, // CUDA device +}; + +// Utility function to convert sizes pointer to vector +inline std::vector convert_sizes_to_vector( + int64_t ndim, + const int64_t* sizes_ptr) { + std::vector sizes(ndim); + for (int i = 0; i < ndim; i++) { + sizes[i] = static_cast(sizes_ptr[i]); + } + return sizes; +} + +// Utility function to convert strides pointer to vector or calculate from sizes +inline std::vector convert_strides_to_vector( + int64_t ndim, + const int64_t* sizes_ptr, + const int64_t* strides_ptr) { + std::vector strides(ndim); + + if (strides_ptr != nullptr) { + // Use provided strides. it is ok if provided strides here is not contiguous + // strides since it will be used internally in CUDA delegate. 
+ for (int64_t i = 0; i < ndim; i++) { + strides[i] = static_cast(strides_ptr[i]); + } + } else { + // Calculate strides from sizes using ExecutorTorch's algorithm + if (ndim > 0) { + strides[ndim - 1] = static_cast( + 1); // Last dimension has stride 1 + for (int64_t i = ndim - 2; i >= 0; i--) { + if (sizes_ptr[i + 1] == 0) { + strides[i] = strides[i + 1]; // Copy stride when size is 0 + } else { + strides[i] = static_cast( + static_cast(strides[i + 1]) * sizes_ptr[i + 1]); + } + } + } + } + return strides; +} + +extern "C" { +using executorch::runtime::Error; +// Common AOTI type aliases +using AOTITorchError = Error; + +// Helper function to check if a dtype is supported in ET CUDA backend +inline bool is_dtype_supported_in_et_cuda(int32_t dtype) { + switch (dtype) { + case static_cast(SupportedDTypes::INT64): + case static_cast(SupportedDTypes::FLOAT32): + case static_cast(SupportedDTypes::BFLOAT16): + return true; + default: + return false; + } +} + +// Dtype validation utility function +inline AOTITorchError validate_dtype(int32_t dtype) { + ET_CHECK_OR_RETURN_ERROR( + is_dtype_supported_in_et_cuda(dtype), + InvalidArgument, + "Unsupported dtype: %d. 
Supported dtypes: %d (int64), %d (float32), %d (bfloat16)", + dtype, + static_cast(SupportedDTypes::INT64), + static_cast(SupportedDTypes::FLOAT32), + static_cast(SupportedDTypes::BFLOAT16)); + + return Error::Ok; +} +} // extern "C" + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/tests/TARGETS b/backends/cuda/tests/TARGETS new file mode 100644 index 00000000000..12718c04388 --- /dev/null +++ b/backends/cuda/tests/TARGETS @@ -0,0 +1,41 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") +load("@fbcode_macros//build_defs:python_unittest_remote_gpu.bzl", "python_unittest_remote_gpu") + +oncall("executorch") + +python_unittest_remote_gpu( + name = "test_cuda_export", + srcs = [ + "test_cuda_export.py", + ], + visibility = [ + "//executorch/...", + ], + deps = [ + "//caffe2:torch", + "//executorch/backends/cuda:cuda_backend", + "//executorch/backends/cuda:cuda_partitioner", + "//executorch/exir:lib", + "//executorch/exir/backend:backend_api", + "//executorch/exir/backend:compile_spec_schema", + ], + keep_gpu_sections = True, +) + +python_unittest( + name = "test_cuda_partitioner", + srcs = [ + "test_cuda_partitioner.py", + ], + visibility = [ + "//executorch/...", + ], + deps = [ + "//caffe2:torch", + "//executorch/backends/cuda:cuda_partitioner", + "//executorch/backends/cuda:cuda_backend", + "//executorch/exir:lib", + "//executorch/exir/backend:compile_spec_schema", + ], +) diff --git a/backends/cuda/tests/__init__.py b/backends/cuda/tests/__init__.py new file mode 100644 index 00000000000..2e41cd717f6 --- /dev/null +++ b/backends/cuda/tests/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
diff --git a/backends/cuda/tests/test_cuda_export.py b/backends/cuda/tests/test_cuda_export.py new file mode 100644 index 00000000000..d794a4f042c --- /dev/null +++ b/backends/cuda/tests/test_cuda_export.py @@ -0,0 +1,253 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest +from typing import Tuple + +import torch +from executorch.backends.cuda.cuda_backend import CudaBackend +from executorch.backends.cuda.cuda_partitioner import CudaPartitioner +from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower +from torch.export import export + + +class TestCudaExport(unittest.TestCase): + """Test CUDA export functionality for various operations using to_edge_transform_and_lower.""" + + def setUp(self): + """Set up test environment.""" + # Skip tests if CUDA is not available + if not torch.cuda.is_available(): + self.skipTest("CUDA is not available") + + def _export_to_cuda_with_lower( + self, module: torch.nn.Module, inputs: Tuple[torch.Tensor, ...] 
+ ) -> None: + """Helper method to export a module to CUDA backend using to_edge_transform_and_lower.""" + # Export the model + exported_program = export(module, inputs, strict=True) + + # Create partitioner and compile specs + partitioner = CudaPartitioner( + [CudaBackend.generate_method_name_compile_spec("forward")] + ) + + # Use to_edge_transform_and_lower for complete pipeline + edge_program_manager = to_edge_transform_and_lower( + exported_program, + partitioner=[partitioner], + compile_config=EdgeCompileConfig( + _check_ir_validity=False, + ), + ) + + # Verify that the pipeline succeeded + self.assertIsNotNone(edge_program_manager) + self.assertTrue(hasattr(edge_program_manager, "exported_program")) + + # Verify that the final exported program contains delegated calls + exported_program = edge_program_manager.exported_program() + has_delegate_call = False + for node in exported_program.graph.nodes: + if node.op == "call_function" and "executorch_call_delegate" in str( + node.target + ): + has_delegate_call = True + break + + self.assertTrue( + has_delegate_call, "No delegate calls found in final exported program" + ) + + return edge_program_manager + + def test_simple_add(self): + """Test CUDA export for simple element-wise addition.""" + + class AddModule(torch.nn.Module): + def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + return x + y + + module = AddModule() + module.eval() + inputs = (torch.randn(3, 4), torch.randn(3, 4)) + + # Test export + edge_program_manager = self._export_to_cuda_with_lower(module, inputs) + self.assertIsNotNone(edge_program_manager, "Simple add operation export failed") + + def test_conv2d(self): + """Test CUDA export for 2D convolution.""" + + class Conv2dModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(3, 16, kernel_size=3, padding=1) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.conv(x) + + module = Conv2dModule() + module.eval() + 
inputs = (torch.randn(1, 3, 32, 32),) + + # Test export + edge_program_manager = self._export_to_cuda_with_lower(module, inputs) + self.assertIsNotNone(edge_program_manager, "Conv2d operation export failed") + + def test_linear(self): + """Test CUDA export for linear layer.""" + + class LinearModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(128, 64) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.linear(x) + + module = LinearModule() + module.eval() + inputs = (torch.randn(8, 128),) + + # Test export + edge_program_manager = self._export_to_cuda_with_lower(module, inputs) + self.assertIsNotNone(edge_program_manager, "Linear operation export failed") + + def test_resnet_block(self): + """Test CUDA export for a ResNet-style block.""" + + class ResNetBlock(torch.nn.Module): + def __init__(self, in_channels: int, out_channels: int, stride: int = 1): + super().__init__() + self.conv1 = torch.nn.Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=1, + bias=False, + ) + # Use eval mode to avoid batch norm mutations during export + self.bn1 = torch.nn.BatchNorm2d(out_channels) + self.relu = torch.nn.ReLU(inplace=True) + self.conv2 = torch.nn.Conv2d( + out_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + bias=False, + ) + self.bn2 = torch.nn.BatchNorm2d(out_channels) + + # Shortcut connection + self.shortcut = torch.nn.Sequential() + if stride != 1 or in_channels != out_channels: + self.shortcut = torch.nn.Sequential( + torch.nn.Conv2d( + in_channels, + out_channels, + kernel_size=1, + stride=stride, + bias=False, + ), + torch.nn.BatchNorm2d(out_channels), + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + identity = self.shortcut(x) + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + out += identity + out = self.relu(out) + + return out + + module = ResNetBlock(64, 64) + 
# Set module to eval mode to avoid batch norm running statistics mutations + module.eval() + inputs = (torch.randn(1, 64, 32, 32),) + + # Test export + edge_program_manager = self._export_to_cuda_with_lower(module, inputs) + self.assertIsNotNone(edge_program_manager, "ResNet block export failed") + + def test_multi_operation_module(self): + """Test CUDA export for a module with multiple operations.""" + + class MultiOpModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(3, 32, kernel_size=3, padding=1) + self.relu = torch.nn.ReLU() + self.pool = torch.nn.AdaptiveAvgPool2d((1, 1)) + self.linear = torch.nn.Linear(32, 10) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.conv(x) + x = self.relu(x) + x = self.pool(x) + x = x.view(x.size(0), -1) + x = self.linear(x) + return x + + module = MultiOpModule() + module.eval() + inputs = (torch.randn(2, 3, 16, 16),) + + # Test export + edge_program_manager = self._export_to_cuda_with_lower(module, inputs) + self.assertIsNotNone( + edge_program_manager, "Multi-operation module export failed" + ) + + def test_activation_functions(self): + """Test CUDA export for various activation functions.""" + + class ActivationModule(torch.nn.Module): + def forward(self, x: torch.Tensor) -> torch.Tensor: + # Test multiple activation functions + x1 = torch.relu(x) + x2 = torch.sigmoid(x) + x3 = torch.tanh(x) + return x1 + x2 + x3 + + module = ActivationModule() + module.eval() + inputs = (torch.randn(4, 8),) + + # Test export + edge_program_manager = self._export_to_cuda_with_lower(module, inputs) + self.assertIsNotNone(edge_program_manager, "Activation functions export failed") + + def test_mathematical_operations(self): + """Test CUDA export for mathematical operations.""" + + class MathOpsModule(torch.nn.Module): + def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + # Test various mathematical operations + add_result = x + y + mul_result = x * y + sub_result 
= x - y + div_result = x / (y + 1e-8) # Add epsilon to avoid division by zero + return add_result + mul_result + sub_result + div_result + + module = MathOpsModule() + module.eval() + inputs = (torch.randn(4, 4), torch.randn(4, 4)) + + # Test export + edge_program_manager = self._export_to_cuda_with_lower(module, inputs) + self.assertIsNotNone( + edge_program_manager, "Mathematical operations export failed" + ) diff --git a/backends/cuda/tests/test_cuda_partitioner.py b/backends/cuda/tests/test_cuda_partitioner.py new file mode 100644 index 00000000000..cb4a2def1f8 --- /dev/null +++ b/backends/cuda/tests/test_cuda_partitioner.py @@ -0,0 +1,141 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest +from typing import Tuple + +import torch +from executorch.backends.cuda.cuda_partitioner import CudaPartitioner +from executorch.exir.backend.partitioner import PartitionResult +from torch.export import export + + +class TestCudaPartitioner(unittest.TestCase): + """ + Test CUDA partitioner functionality. + + After CUDA partitioning, there should be exactly one partitioned graph that contains + all operators from the input graph. This means all operators should be tagged with + the same delegation tag, indicating they will all be executed by the CUDA backend. + """ + + def _get_partition_result( + self, module: torch.nn.Module, inputs: Tuple[torch.Tensor, ...] 
+ ) -> PartitionResult: + """Helper method to get partition result for a given module.""" + # Export the model + exported_program = export(module, inputs, strict=True) + + # Create partitioner and compile specs + partitioner = CudaPartitioner([]) + + # Get partition result + partition_result = partitioner.partition(exported_program) + + # Verify partition result structure + self.assertIsNotNone(partition_result) + self.assertTrue(hasattr(partition_result, "tagged_exported_program")) + self.assertTrue(hasattr(partition_result, "partition_tags")) + + return partition_result + + def _check_fully_partitioned(self, partition_result: PartitionResult) -> bool: + """Check if the graph is fully partitioned (all operators have the same tag).""" + tagged_nodes = [] + untagged_ops = [] + + for node in partition_result.tagged_exported_program.graph.nodes: + if node.op == "call_function": + if hasattr(node, "meta") and "delegation_tag" in node.meta: + tagged_nodes.append(node) + else: + untagged_ops.append(node) + + # Check if we have any tagged nodes + if not tagged_nodes: + return False + + # Check if all tagged nodes have the same tag + first_tag = tagged_nodes[0].meta["delegation_tag"] + all_same_tag = all( + node.meta.get("delegation_tag") == first_tag for node in tagged_nodes + ) + + # Should have no untagged operations for full partitioning + fully_partitioned = len(untagged_ops) == 0 and all_same_tag + + return fully_partitioned + + def test_simple_add_partition(self): + """ + Test that CUDA partitioner creates exactly one partition containing all operators. + Simple element-wise addition should result in a single graph with all ops tagged identically. 
+ """ + + class AddModule(torch.nn.Module): + def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + return x + y + + module = AddModule() + inputs = (torch.randn(3, 4), torch.randn(3, 4)) + + partition_result = self._get_partition_result(module, inputs) + fully_partitioned = self._check_fully_partitioned(partition_result) + + self.assertTrue( + fully_partitioned, + "Graph should be fully partitioned with all operators having the same tag", + ) + + def test_conv2d_partition(self): + """ + Test that CUDA partitioner creates exactly one partition containing all operators. + Conv2D operation should result in a single graph with all ops tagged identically. + """ + + class Conv2dModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(3, 16, kernel_size=3, padding=1) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.conv(x) + + module = Conv2dModule() + inputs = (torch.randn(1, 3, 32, 32),) + + partition_result = self._get_partition_result(module, inputs) + fully_partitioned = self._check_fully_partitioned(partition_result) + + self.assertTrue( + fully_partitioned, + "Graph should be fully partitioned with all operators having the same tag", + ) + + def test_linear_partition(self): + """ + Test that CUDA partitioner creates exactly one partition containing all operators. + Linear layer operation should result in a single graph with all ops tagged identically. 
+ """ + + class LinearModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(128, 64) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.linear(x) + + module = LinearModule() + inputs = (torch.randn(8, 128),) + + partition_result = self._get_partition_result(module, inputs) + fully_partitioned = self._check_fully_partitioned(partition_result) + + self.assertTrue( + fully_partitioned, + "Graph should be fully partitioned with all operators having the same tag", + ) diff --git a/backends/cuda/tests/voxtral_runner.cpp b/backends/cuda/tests/voxtral_runner.cpp new file mode 100644 index 00000000000..feed458e1f5 --- /dev/null +++ b/backends/cuda/tests/voxtral_runner.cpp @@ -0,0 +1,264 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace { + +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::extension::make_tensor_ptr; +using executorch::extension::TensorPtr; +using executorch::extension::module::Module; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::Result; +using Clock = std::chrono::steady_clock; +using DurationMs = std::chrono::duration; + +std::vector to_sizes( + std::initializer_list dims) { + return std::vector(dims.begin(), dims.end()); +} + +std::string format_shape(const Tensor& tensor) { + std::ostringstream oss; + oss << "["; + const auto& sizes = tensor.sizes(); + for (size_t i = 0; i < sizes.size(); ++i) { + if (i > 0) { + oss << ", "; + } + oss << sizes[i]; + } + oss << "]"; + return oss.str(); +} + +void print_tensor_summary(const std::string& label, const Tensor& tensor) { + std::cout << " " << label + << ": dtype=" << executorch::runtime::toString(tensor.scalar_type()) + << ", shape=" << format_shape(tensor) + << ", numel=" << tensor.numel() << std::endl; +} + +TensorPtr create_audio_input() { + const auto 
sizes = to_sizes({3, 128, 3000}); + const size_t numel = 3ull * 128ull * 3000ull; + std::vector data(numel, 0.5f); + return make_tensor_ptr( + sizes, std::move(data), {}, {}, ScalarType::BFloat16); +} + +TensorPtr create_token_ids_input() { + const auto sizes = to_sizes({1, 1138}); + std::vector data(static_cast(1) * 1138, 0); + return make_tensor_ptr(sizes, std::move(data)); +} + +TensorPtr create_positions_input() { + const auto sizes = to_sizes({1138}); + std::vector data(static_cast(1138), 0); + return make_tensor_ptr(sizes, std::move(data)); +} + +TensorPtr create_fallback_text_embedding() { + const auto sizes = to_sizes({1, 1138, 3072}); + const size_t numel = 1ull * 1138ull * 3072ull; + std::vector data(numel, 0.0f); + return make_tensor_ptr( + sizes, std::move(data), {}, {}, ScalarType::BFloat16); +} + +struct MethodTiming { + double load_ms{0.0}; + double run_ms{0.0}; +}; + +} // namespace + +int main(int argc, char** argv) { + if (argc != 3) { + std::cerr << "Usage: " << argv[0] + << " " + << std::endl; + return 1; + } + + const std::string program_path = argv[1]; + const std::string data_map_path = argv[2]; + + try { + Module module(program_path, data_map_path); + + const auto program_load_start = Clock::now(); + const Error program_load_error = module.load(); + const auto program_load_end = Clock::now(); + if (program_load_error != Error::Ok) { + std::cerr << "Failed to load ExecuTorch program: error code " + << static_cast(program_load_error) << std::endl; + return 1; + } + const DurationMs program_load_latency = + program_load_end - program_load_start; + + MethodTiming audio_timing; + MethodTiming token_timing; + MethodTiming text_timing; + + auto measure_method_load = + [&](const std::string& name) -> std::pair { + const auto start = Clock::now(); + const Error err = module.load_method(name); + const auto end = Clock::now(); + return {err, DurationMs(end - start).count()}; + }; + + // audio_encoder + { + const auto [err, load_ms] = 
measure_method_load("audio_encoder"); + if (err != Error::Ok) { + std::cerr << "Failed to load method audio_encoder: error code " + << static_cast(err) << std::endl; + return 1; + } + audio_timing.load_ms = load_ms; + + const TensorPtr audio_input = create_audio_input(); + std::vector inputs; + std::vector owned_inputs; + owned_inputs.emplace_back(audio_input); + inputs.emplace_back(*audio_input); + + const auto run_start = Clock::now(); + Result> output_result = + module.execute("audio_encoder", inputs); + const auto run_end = Clock::now(); + audio_timing.run_ms = DurationMs(run_end - run_start).count(); + + if (output_result.error() != Error::Ok) { + std::cerr << "audio_encoder execution failed: error code " + << static_cast(output_result.error()) << std::endl; + return 1; + } + + const auto& outputs = output_result.get(); + if (!outputs.empty() && outputs[0].isTensor()) { + print_tensor_summary("audio_encoder output", outputs[0].toTensor()); + } + } + + EValue token_output; + bool token_executed = false; + + // token_embedding + { + const auto [err, load_ms] = measure_method_load("token_embedding"); + if (err != Error::Ok) { + std::cerr << "Failed to load method token_embedding: error code " + << static_cast(err) << std::endl; + return 1; + } + token_timing.load_ms = load_ms; + + const TensorPtr token_ids = create_token_ids_input(); + std::vector inputs; + std::vector owned_inputs; + owned_inputs.emplace_back(token_ids); + inputs.emplace_back(*token_ids); + + const auto run_start = Clock::now(); + auto token_output_result = module.execute("token_embedding", inputs); + const auto run_end = Clock::now(); + token_timing.run_ms = DurationMs(run_end - run_start).count(); + + if (token_output_result.error() != Error::Ok) { + std::cerr << "token_embedding execution failed: error code " + << static_cast(token_output_result.error()) << std::endl; + return 1; + } + + token_executed = true; + const auto& outputs = token_output_result.get(); + if (!outputs.empty() && 
outputs[0].isTensor()) { + print_tensor_summary("token_embedding output", outputs[0].toTensor()); + token_output = outputs[0]; + } + } + + // text_decoder + { + const auto [err, load_ms] = measure_method_load("text_decoder"); + if (err != Error::Ok) { + std::cerr << "Failed to load method text_decoder: error code " + << static_cast(err) << std::endl; + return 1; + } + text_timing.load_ms = load_ms; + + std::vector inputs; + std::vector owned_inputs; + if (token_executed) { + if (token_output.isTensor()) { + inputs.emplace_back(token_output); + } + } + + if (inputs.empty()) { + auto fallback_embedding = create_fallback_text_embedding(); + owned_inputs.emplace_back(fallback_embedding); + inputs.emplace_back(*fallback_embedding); + } + + auto positions = create_positions_input(); + owned_inputs.emplace_back(positions); + inputs.emplace_back(*positions); + + const auto run_start = Clock::now(); + Result> output_result = + module.execute("text_decoder", inputs); + const auto run_end = Clock::now(); + text_timing.run_ms = DurationMs(run_end - run_start).count(); + + if (output_result.error() != Error::Ok) { + std::cerr << "text_decoder execution failed: error code " + << static_cast(output_result.error()) << std::endl; + return 1; + } + + const auto& outputs = output_result.get(); + if (!outputs.empty() && outputs[0].isTensor()) { + print_tensor_summary("text_decoder output", outputs[0].toTensor()); + } + } + + std::cout << std::fixed << std::setprecision(3); + std::cout << "Program load latency (ms): " << program_load_latency.count() + << std::endl; + + std::cout << "Method load latency (ms):" << std::endl; + std::cout << " audio_encoder: " << audio_timing.load_ms << std::endl; + std::cout << " token_embedding: " << token_timing.load_ms << std::endl; + std::cout << " text_decoder: " << text_timing.load_ms << std::endl; + + std::cout << "Run latency (ms):" << std::endl; + std::cout << " audio_encoder: " << audio_timing.run_ms << std::endl; + std::cout << " 
token_embedding: " << token_timing.run_ms << std::endl; + std::cout << "  text_decoder: " << text_timing.run_ms << std::endl; + + return 0; + } catch (const std::exception& ex) { + std::cerr << "Unhandled exception: " << ex.what() << std::endl; + return 1; + } +} diff --git a/backends/mediatek/CMakeLists.txt b/backends/mediatek/CMakeLists.txt index ed9b37e1998..10c28be0053 100644 --- a/backends/mediatek/CMakeLists.txt +++ b/backends/mediatek/CMakeLists.txt @@ -46,5 +46,5 @@ executorch_target_link_options_shared_lib(neuron_backend) install( TARGETS neuron_backend EXPORT ExecuTorchTargets - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} ) diff --git a/backends/mediatek/README.md b/backends/mediatek/README.md index e8a535b3fde..6ff751f8408 100644 --- a/backends/mediatek/README.md +++ b/backends/mediatek/README.md @@ -28,7 +28,7 @@ To get started with MediaTek's ExecuTorch libraries, download the [NeuroPilot Ex - **`mtk_converter-8.13.0+public-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl`**: This library preprocesses the model into a MediaTek representation. -- **`mtk_neuron-8.2.19-py3-none-linux_x86_64.whl`**: This library converts the model to binaries. +- **`mtk_neuron-8.2.23-py3-none-linux_x86_64.whl`**: This library converts the model to binaries. Additionally, make sure to copy `NeuronAdapter.h` to the following directory: `backends/mediatek/runtime/include/api/`. 
@@ -45,7 +45,7 @@ Follow the steps below to setup your build environment: ``` - Install the two .whl downloaded from NeuroPilot Portal ```bash - pip3 install mtk_neuron-8.2.19-py3-none-linux_x86_64.whl + pip3 install mtk_neuron-8.2.23-py3-none-linux_x86_64.whl pip3 install mtk_converter-8.13.0+public-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl ``` diff --git a/backends/mediatek/scripts/mtk_build.sh b/backends/mediatek/scripts/mtk_build.sh index 599f754d7bc..d42e5f7e10a 100755 --- a/backends/mediatek/scripts/mtk_build.sh +++ b/backends/mediatek/scripts/mtk_build.sh @@ -30,6 +30,7 @@ cmake -DCMAKE_INSTALL_PREFIX="${build_dir}" \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_NEURON=ON \ -B"${build_dir}" diff --git a/backends/nxp/CMakeLists.txt b/backends/nxp/CMakeLists.txt index 43fcaa24d19..bfc4c046be6 100644 --- a/backends/nxp/CMakeLists.txt +++ b/backends/nxp/CMakeLists.txt @@ -17,5 +17,5 @@ target_include_directories( install( TARGETS executorch_delegate_neutron EXPORT ExecuTorchTargets - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} ) diff --git a/backends/nxp/README.md b/backends/nxp/README.md index 10eb1290a8b..de41cdd282e 100644 --- a/backends/nxp/README.md +++ b/backends/nxp/README.md @@ -15,7 +15,8 @@ networks, as well as the ability to adapt and scale to new model architectures, to AI workloads. ML application development with the eIQ Neutron NPU is fully supported by the [eIQ machine learning software development environment](https://www.nxp.com/design/design-center/software/eiq-ml-development-environment/eiq-toolkit-for-end-to-end-model-development-and-deployment:EIQ-TOOLKIT). The eIQ AI SW Stack provides a streamlined development experience for developers and end-users of NXP products. 
-eIQ extensions connect broader AI ecosystems to the edge, such as the NVIDIA TAO extension, which enables developers to bring AI models trained and fine-tuned with TAO to NXP-powered edge devices. +eIQ extensions connect broader AI ecosystems to the edge, such as the NVIDIA TAO extension, which enables developers +to bring AI models trained and fine-tuned with TAO to NXP-powered edge devices. ## Supported NXP platforms @@ -35,37 +36,28 @@ improvements. NXP and the ExecuTorch community is actively developing this codeb ## Neutron Backend implementation and SW architecture Neutron Backend uses the eIQ Neutron Converter as ML compiler to compile the delegated subgraph to Neutron microcode. -The Neutron Converter accepts the ML model in LiteRT format, for the **eIQ Neutron N3** class therefore the Neutron Backend uses the LiteRT flatbuffers format as IR between the ExecuTorch and Neutron Converter ML compiler. - -The Neutron Backend in its early prototype phase, is based on existing NXP products, such as -onnx2tflite, known from the NXP's eIQ Toolkit. -The **onnx2tflite** is a converter from the ONNX format to LiteRT (formerly known as TFLite). -It consists of 3 stages: -* ONNX Model Parsing -* Tensor Format Inference, to identify tensors using channel-first layer -* ONNX to LiteRT Conversion -* Optimization Passes, which operate on top of the LiteRT format -* LiteRT Serialization - -Due to the similarities between ONNX to LiteRT and Edge to LiteRT conversion, the Neutron Backend's -currently leverages the Tensor format Inference and LiteRT Optimizer. -This shall be considered as temporary solution, intended to be replaced with: -* Dim Order (https://github.com/pytorch/executorch/issues/4873) -* Corresponding ExecuTorch/ATen passes - -before reaching higher maturity status by the end of 2025. 
+The Neutron Converter accepts the ML model in LiteRT format, for the **eIQ Neutron N3** class therefore the Neutron Backend +uses the LiteRT flatbuffers format as IR between the ExecuTorch and Neutron Converter ML compiler. ## Layout -The current code base is as follows: * `backend/ir/` - TFLite/LiteRT based IR to represent the Edge Subgraph, taken from onnx2tflite code base and extended to support Edge Dialect to LiteRT conversion. * `backend/ir/converter` - Neutron Backends conversion from Edge (ATen) Dialect to LiteRT, TFLite. The subfolder `node_conveters` is structured as single module for each Edge operator. - * `backend/ir/lib` - automatically generated handlers from LiteRT flatbuffers schema + * `backend/ir/lib` - automatically generated handlers from LiteRT flatbuffers schema. * `backend/ir/tflite_generator` and `backend/ir/tflite_optimizer` handle the serialization of the in-memory built subgraph for delegation into LiteRT/TFLite flatbuffers representation. Code taken from the onnx2tflite tool. -* `quantizer` - Neutron Backends quantizer implementation. +* `edge_passes` - Various passes operating on Edge dialect level. +* `quantizer` - Neutron Backend quantizer implementation. +* `runtime` - Neutron Backend runtime implementation. For running compiled models on device. +* `tests/` - Unit tests for Neutron backend. + * `tests/converter/node_converter` - Operator level unit tests. + +* `examples/nxp/` - Example models and scripts for running them. + +## Examples +Please see this [README.md](https://github.com/pytorch/executorch/blob/main/examples/nxp/README.md). 
## Help & Improvements If you have problems or questions or have suggestions for ways to make diff --git a/backends/nxp/aten_passes/fuse_linear_and_add_pass.py b/backends/nxp/aten_passes/fuse_linear_and_add_pass.py new file mode 100644 index 00000000000..20a32c1bcac --- /dev/null +++ b/backends/nxp/aten_passes/fuse_linear_and_add_pass.py @@ -0,0 +1,204 @@ +# Copyright 2025 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Optional + +import torch + +from executorch.backends.nxp.backend.edge_helper import ( + try_get_tensor_constant_from_node, +) +from torch.ao.quantization.fx.utils import get_new_attr_name_with_prefix +from torch.export.unflatten import _assign_attr, _AttrKind +from torch.fx import GraphModule, Node +from torch.fx.passes.infra.pass_base import PassBase, PassResult + + +class FuseLinearAndAddPass(PassBase): + """Replace a sequence of `linear` and `add` nodes in the following pattern by a single `linear` node when possible. + │ + ┌──────▼──────┐ + │ aten.linear │ + └──────┬──────┘ │ + │ replace with ┌──────▼──────┐ + ┌─────▼────┐ ───────────► │ aten.linear │ + │ aten.add │ └──────┬──────┘ + └─────┬────┘ + ▼ + """ + + def _fuse_with_existing_bias( + self, + linear_node: Node, + other_add_input: Node, + graph_module: GraphModule, + alpha: float, + ) -> bool: + """Fuse the `linear` and `add` nodes provided the `linear` already has a bias. + The fusion can only be done if both the "biases" have static data, which can be added together to get a + single bias. + + :return: True, if the nodes were successfully merged. False, otherwise. + """ + + linear_bias = linear_node.args[2] + if other_add_input.meta["val"].shape != linear_bias.meta["val"].shape: + # The biases cannot be added together due to their different shapes. + # Shape broadcasting is not applicable, as the only allowed `linear` bias shape is 1D ([output_features]). 
+ return False + + bias_data = [ + try_get_tensor_constant_from_node(graph_module, linear_bias), + try_get_tensor_constant_from_node(graph_module, other_add_input), + ] + if any(data is None for data in bias_data): + return ( + False # Fusion is not possible because at least 1 bias is not static. + ) + + # Add the bias data together, to obtain the combined bias. Take the `alpha` attribute into account. + combined_bias = bias_data[0] + bias_data[1] * alpha + + # Create a new node containing the combined bias data. + combined_bias_name = get_new_attr_name_with_prefix( + linear_bias.name + "combined" + )(graph_module) + _assign_attr( + torch.nn.Parameter(combined_bias), + graph_module, + combined_bias_name, + _AttrKind.PARAMETER, + ) + with graph_module.graph.inserting_before(linear_node): + new_bias_node = graph_module.graph.get_attr(combined_bias_name) + + # Use the combined bias as the new bias for the `Linear`. + linear_node.args = ( + linear_node.args[:2] + (new_bias_node,) + linear_node.args[3:] + ) + return True + + def _fuse_without_existing_bias( + self, + linear_node: Node, + other_add_input: Node, + graph_module: GraphModule, + alpha: float, + ) -> bool: + """Fuse the `linear` and `add` provided the `linear` does not already have a bias. + + :return: True, if the nodes were successfully merged. False, otherwise. + """ + + # The weights have shape (out_features, in_features). + output_features = linear_node.args[1].meta["val"].shape[0] + new_bias_shape = other_add_input.meta["val"].shape + if list(new_bias_shape) != [output_features]: + return False # The `Add` is adding a tensor with shape that is not supported for the `Linear` bias. + + bias_data = try_get_tensor_constant_from_node(graph_module, other_add_input) + + if bias_data is None: + return False # Neutron doesn't support a dynamic bias, so fusion would be counterproductive. 
+ + # It is possible that the `linear` comes before the `other_add_input` in the graph, so it cannot use it as an + # input directly. If the nodes are ordered as [linear, ..., other_add_input, ... add] (which is valid), using + # `other_add_input` directly as an input to `Linear` would not follow topological order. + # Rearranging the nodes is not trivial, as the graph could be complex (ultimately, the + # `other_add_input` could even originate from the `Linear` node...). + # Since the `other_add_input` has static data, we can create a new node with the data just before the `Linear` + # to ensure topological order. + # Regardless of the node ordering, the `add.Tensor` attribute `alpha` multiplies the second `add` input. If + # `alpha != 1`, we would have to insert a `mul` operator if we wanted to keep the original parameter node. + # Therefore, it is better to create a new static parameter node for the multiplied data in this case as well. + nodes = list(graph_module.graph.nodes) + if nodes.index(linear_node) < nodes.index(other_add_input) or alpha != 1.0: + # Problematic order, or required multiplication. + + # Handle the `aten.add.Tensor` attribute `alpha`. + bias_data *= alpha + + # Create a unique name. + new_bias_name = get_new_attr_name_with_prefix(linear_node.name + "_bias")( + graph_module + ) + _assign_attr(bias_data, graph_module, new_bias_name, _AttrKind.PARAMETER) + with graph_module.graph.inserting_before(linear_node): + new_bias_node = graph_module.graph.get_attr(new_bias_name) + + # Use the added tensor as the new `Linear` bias. + linear_node.args = ( + linear_node.args[:2] + (new_bias_node,) + linear_node.args[2:] + ) + return True + + else: + # Use the `other_add_input` directly as the new bias. 
+ linear_node.args = ( + linear_node.args[:2] + (other_add_input,) + linear_node.args[2:] + ) + return True + + def call(self, graph_module: GraphModule) -> Optional[PassResult]: + def _is_applicable_linear_node(node_: Node): + is_linear = ( + node_.op == "call_function" + and node_.target == torch.ops.aten.linear.default + ) + has_single_user = len(node.users) == 1 + + return is_linear and has_single_user + + def _is_add(node_: Node): + return ( + node_.op == "call_function" + and node_.target == torch.ops.aten.add.Tensor + ) + + made_changes = False + for node in graph_module.graph.nodes: + if not _is_applicable_linear_node( + linear_node := node + ): # Also ensures a single user. + continue + + if not _is_add(add_node := list(linear_node.users.keys())[0]): + continue # Not the `Linear` -> `Add` case. + + if len(add_node.args) != 2: + continue # Unexpected case. + + # The `aten.add.Tensor` carries out the expression `out = input[0] + alpha × input[1]`. + # https://docs.pytorch.org/docs/stable/generated/torch.add.html + alpha = add_node.kwargs.get("alpha", 1.0) + if add_node.args[0] == linear_node: + other_add_input = add_node.args[1] + + else: + # The fusion is not implemented. The `other_add_input` would have to be divided by `alpha` before the + # fusion, and a `mul` operator would have to be added after the `linear` to multiply its output by + # `alpha`. + continue + + if len(linear_node.args) > 2: + if not self._fuse_with_existing_bias( + linear_node, other_add_input, graph_module, alpha + ): + continue # The nodes could not be fused. + + else: + # The `Linear` doesn't have a bias yet. + if not self._fuse_without_existing_bias( + linear_node, other_add_input, graph_module, alpha + ): + continue # The nodes could not be fused. + + # Use the output of the `Linear` instead of the `Add`, and remove the now unused `Add` node. 
+ add_node.replace_all_uses_with(linear_node) + graph_module.graph.erase_node(add_node) + + made_changes = True + + return PassResult(graph_module, made_changes) diff --git a/backends/nxp/aten_passes/neutron_aten_pass_manager.py b/backends/nxp/aten_passes/neutron_aten_pass_manager.py index f6e3c374b19..407ebf5da61 100644 --- a/backends/nxp/aten_passes/neutron_aten_pass_manager.py +++ b/backends/nxp/aten_passes/neutron_aten_pass_manager.py @@ -13,6 +13,9 @@ from executorch.backends.nxp.aten_passes.fuse_batch_norm_with_linear_pass import ( FuseBatchNormWithLinearPass, ) +from executorch.backends.nxp.aten_passes.fuse_linear_and_add_pass import ( + FuseLinearAndAddPass, +) from executorch.backends.nxp.aten_passes.remove_nodes_with_known_outputs import ( RemoveNodesWithKnownOutputs, ) @@ -38,6 +41,7 @@ def __init__(self, passes: list[PassType] = None): SplitGroupConvolution(), SplitGRUBasedOnNumLayers(), RemoveNodesWithKnownOutputs(), + FuseLinearAndAddPass(), ] super().__init__(passes) diff --git a/backends/nxp/backend/edge_helper.py b/backends/nxp/backend/edge_helper.py index 061295ead79..60b367c0f39 100644 --- a/backends/nxp/backend/edge_helper.py +++ b/backends/nxp/backend/edge_helper.py @@ -1,4 +1,4 @@ -# Copyright 2024 NXP +# Copyright 2024-2025 NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
diff --git a/backends/nxp/backend/edge_program_converter.py b/backends/nxp/backend/edge_program_converter.py index ddbbf5b2e3a..fcfb9787715 100644 --- a/backends/nxp/backend/edge_program_converter.py +++ b/backends/nxp/backend/edge_program_converter.py @@ -18,6 +18,7 @@ from torch.fx import Node from torch.nn.parameter import Parameter from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters import * # noqa F403 +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from executorch.backends.nxp.backend.node_format_inference import ( NodeFormat, NodeFormatInference, @@ -33,6 +34,7 @@ exir_ops.edge.aten.avg_pool2d.default: AvgPool2dConverter, # noqa F405 exir_ops.edge.aten.cat.default: CatConverter, # noqa F405 exir_ops.edge.aten.clone.default: CloneConverter, # noqa F405 + exir_ops.edge.dim_order_ops._clone_dim_order.default: CloneConverter, # noqa F405 exir_ops.edge.aten.constant_pad_nd.default: ConstantPadNDConverter, # noqa F405 exir_ops.edge.aten.convolution.default: ConvolutionConverter, # noqa F405 exir_ops.edge.aten.hardtanh.default: HardTanhConverter, # noqa F405 @@ -42,6 +44,7 @@ exir_ops.edge.aten.permute_copy.default: PermuteCopyConverter, # noqa F405 exir_ops.edge.aten.relu.default: ReLUConverter, # noqa F405 exir_ops.edge.aten._softmax.default: SoftmaxConverter, # noqa F405 + exir_ops.edge.aten.sub.Tensor: SubTensorConverter, # noqa F405 exir_ops.edge.aten.tanh.default: TanhConverter, # noqa F405 exir_ops.edge.aten.view_copy.default: ViewCopyConverter, # noqa F405 exir_ops.edge.aten.sigmoid.default: SigmoidConverter, # noqa F405 @@ -54,12 +57,14 @@ class EdgeProgramToIRConverter: """ _default_conversion_config = ConversionConfig() + _default_target_spec = NeutronTargetSpec("imxrt700", "SDK_25_09") _default_delegation_options = CustomDelegationOptions() def convert_program( self, edge_program: ExportedProgram, - conversion_config=_default_conversion_config, + conversion_config: ConversionConfig = 
_default_conversion_config, + neutron_target_spec: NeutronTargetSpec = _default_target_spec, custom_delegation_options: CustomDelegationOptions = _default_delegation_options, ) -> (bytes, dict): """ @@ -67,6 +72,7 @@ def convert_program( :param edge_program: Converter ExportedProgram. :param conversion_config: ConversionConfig instance. + :param neutron_target_spec: Object for querying the target platform to retrieve its properties. :param custom_delegation_options: Custom user options which affect node delegation. :return: TFLite flatbuffers as bytes. """ @@ -76,6 +82,7 @@ def convert_program( cc = self.build_conversion_context( parameters_mapping, node_formats, + neutron_target_spec, conversion_config, custom_delegation_options, ) @@ -134,6 +141,7 @@ def _process_nodes(self, nodes: list[Node], conversion_context: ConversionContex qdq_related_functions = [ exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default, + exir_ops.edge.quantized_decomposed.dequantize_per_channel.default, exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, ] @@ -172,11 +180,12 @@ def map_inputs_to_parameters(edge_program: ExportedProgram) -> dict[str, Paramet def build_conversion_context( parameters_mapping: dict, node_formats: dict[Node, NodeFormat], + neutron_target_spec: NeutronTargetSpec, conversion_config: ConversionConfig = _default_conversion_config, custom_delegation_options: CustomDelegationOptions = _default_delegation_options, ) -> ConversionContext: tflite_builder = AtenModelBuilderDirector( - 3, "TFLite from EdgeProgram", conversion_config + 3, "TFLite from EdgeProgram", neutron_target_spec, conversion_config ) # Add "sentinel" buffer (defined in schema.fbs) @@ -203,7 +212,8 @@ def _convert_qdq_cluster_q_dq_nodes( :param conversion_context: ConversionContext instance. 
""" qdq_q_ops_converters = { - exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: QDQDequantizeConverter, # noqa F405 + exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: QDQPerTensorDequantizeConverter, # noqa F405 + exir_ops.edge.quantized_decomposed.dequantize_per_channel.default: QDQPerChannelDequantizeConverter, # noqa F405 exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: QDQQuantizeConverter, # noqa F405 } diff --git a/backends/nxp/backend/ir/conversion_config.py b/backends/nxp/backend/ir/conversion_config.py index 4ac88eb467c..622735e881f 100644 --- a/backends/nxp/backend/ir/conversion_config.py +++ b/backends/nxp/backend/ir/conversion_config.py @@ -1,4 +1,4 @@ -# Copyright 2024 NXP +# Copyright 2024-2025 NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -14,7 +14,6 @@ def __init__(self, args: dict | None = None): :param args: Optional dictionary with conversion arguments. Unknown arguments are ignored. """ self.keep_io_format: bool = False - self.skip_shape_inference: bool = False self.allow_inputs_stripping: bool = True self.qdq_aware_conversion: bool = True self.symbolic_dimensions_mapping: dict[str, int] | None = None @@ -46,15 +45,6 @@ def __repr__(self): return "ConversionConfig[" + ", ".join(attrs) + "]" -class SkipShapeInferenceConfig(ConversionConfig): - - def __init__(self): - """ - Conversion config shortcut with disabled shape inference. 
- """ - super().__init__({"skip_shape_inference": True}) - - class QDQAwareConfig(ConversionConfig): def __init__(self): diff --git a/backends/nxp/backend/ir/converter/builder/model_builder.py b/backends/nxp/backend/ir/converter/builder/model_builder.py index 4f036854138..643a6231d15 100755 --- a/backends/nxp/backend/ir/converter/builder/model_builder.py +++ b/backends/nxp/backend/ir/converter/builder/model_builder.py @@ -1,6 +1,6 @@ # # Copyright 2023 Martin Pavella -# Copyright 2023-2024 NXP +# Copyright 2023-2025 NXP # # License: MIT # See the LICENSE_MIT for more details. @@ -48,6 +48,7 @@ FlexTranspose, ) from executorch.backends.nxp.backend.ir.tflite_optimizer import optimizer +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec class ModelBuilder: @@ -74,17 +75,21 @@ class ModelBuilder: _zeros_tensor_map: Dict # Mapping 'string' shapes to 'tflT.Tensor' objects - _default_conversion_config = ConversionConfig() + neutron_target_spec: NeutronTargetSpec conversion_config: ConversionConfig + _default_conversion_config = ConversionConfig() + def __init__( self, model_version: int, model_description: str, + neutron_target_spec: NeutronTargetSpec, conversion_config: ConversionConfig = _default_conversion_config, ) -> None: self._tfl_model = tflite_model.Model(model_version, model_description) + self.neutron_target_spec = neutron_target_spec self.conversion_config = conversion_config self.op_code_type_index_map = {} @@ -471,31 +476,7 @@ def finish(self) -> tflite_model.Model: return self._tfl_model - def _assign_tensor_and_buffer_indices( # noqa C901 - self, allow_inputs_stripping: bool - ): - """Correctly initialize all references via indices in all tensors and buffers.""" - - # Assign each buffer its index - for i, buffer in enumerate(self.get_buffers().vector): - buffer.tmp_index = i - - # Assign each tensor its index and its buffer index - for i, tensor in enumerate(self.get_tensors().vector): - if tensor.tmp_null_tensor: - # Using 
-1 as the index to the 'tensors' vector is way of telling the TFLite inference engine, that - # this tensor should not be used. - # https://github.com/tensorflow/tensorflow/blob/05404d959119d41a8ffb8a75c6f232cfd8540d45/tensorflow/lite/kernels/kernel_util.cc#L79-L98 - tensor.tmp_index = -1 - else: - tensor.tmp_index = i - - tensor.buffer = tensor.tmp_buffer.tmp_index - - # TODO Remove inputs and outputs that are not in the tensors collection - - # Assign 'Outputs' and 'Inputs' their tensor indices - outputs = self.get_sub_graph().outputs + def _assign_io_tensor_indices(self, inputs, outputs, allow_inputs_stripping: bool): for tensor in outputs.tmp_outputs: try: outputs.append(tensor.tmp_index) @@ -505,7 +486,6 @@ def _assign_tensor_and_buffer_indices( # noqa C901 f"The tensor '{tensor.name}' is among the model outputs, but does NOT appear in the graph!", ) - inputs = self.get_sub_graph().inputs for tensor in inputs.tmp_inputs: try: inputs.append(tensor.tmp_index) @@ -520,14 +500,46 @@ def _assign_tensor_and_buffer_indices( # noqa C901 f"The tensor '{tensor.name}' is among the model inputs, but does NOT appear in the graph!", ) - # Assign each operator its inputs and outputs indices - for operator in self.get_sub_graph().operators.vector: + def _assign_operators_io_tensor_indices(self, operators): + for operator in operators.vector: for inputTensor in operator.tmp_inputs: operator.inputs.append(inputTensor.tmp_index) for outputTensor in operator.tmp_outputs: operator.outputs.append(outputTensor.tmp_index) + def _assign_tensor_and_buffer_indices(self, allow_inputs_stripping: bool): + """Correctly initialize all references via indices in all tensors and buffers.""" + + # Assign each buffer its index + for i, buffer in enumerate(self.get_buffers().vector): + buffer.tmp_index = i + + # Assign each tensor its index and its buffer index + for i, tensor in enumerate(self.get_tensors().vector): + if tensor.tmp_null_tensor: + # Using -1 as the index to the 'tensors' vector is 
way of telling the TFLite inference engine, that + # this tensor should not be used. + # https://github.com/tensorflow/tensorflow/blob/05404d959119d41a8ffb8a75c6f232cfd8540d45/tensorflow/lite/kernels/kernel_util.cc#L79-L98 + tensor.tmp_index = -1 + else: + tensor.tmp_index = i + + tensor.buffer = tensor.tmp_buffer.tmp_index + + # TODO Remove inputs and outputs that are not in the tensors collection + + subgraph = self.get_sub_graph() + + # Assign 'Outputs' and 'Inputs' their tensor indices + self._assign_io_tensor_indices( + inputs=subgraph.inputs, + outputs=subgraph.outputs, + allow_inputs_stripping=allow_inputs_stripping, + ) + # Assign each operator its inputs and outputs indices + self._assign_operators_io_tensor_indices(operators=subgraph.operators) + def _build_operator_code( self, op_type: BuiltinOperator, version, custom_code: str = None ): @@ -795,29 +807,8 @@ def _remove_tensor_with_name(self, name): def append_new_tensor(self, t_tensor: tflite_model.Tensor, overwrite: bool = False): """Append the TFLite tensor 't_tensor' to the 'SubGraph.tensors' and register it.""" - - if t_tensor.name in self._tensor_name_map.keys(): - """Tensor has already been added. Sometimes however, ONNX models - will have tensors in their 'inputs' or 'outputs', which don't - belong there and are in fact static. I this case we need to - overwrite the existing tensors.""" - - if overwrite: - self._remove_tensor_with_name(t_tensor.name) - - # If the tenor previously appeared in ONNX 'inputs' or 'outputs', - # the old version MUST be removed from there. 
- self._remove_input_with_name(t_tensor.name) - self._remove_output_with_name(t_tensor.name) - - self.get_tensors().append(t_tensor) - self._tensor_name_map[t_tensor.name] = t_tensor - else: - logger.w(f"Tensor '{t_tensor.name}' is already in the tensors!") - - else: - self._tensor_name_map[t_tensor.name] = t_tensor - self.get_tensors().append(t_tensor) + self._tensor_name_map[t_tensor.name] = t_tensor + self.get_tensors().append(t_tensor) def append_new_buffer(self, buffer: tflite_model.Buffer): """Append the 'buffer' to the 'model.buffers'.""" @@ -1515,7 +1506,7 @@ def prepare_dynamic_tensor_for_correct_broadcasting_with_channels_first_tensors( # Prepend a partial identity, to keep leading dimensions unchanged. revert_perm = list(range(rank_diff)) + list(revert_perm) - # Now add a permutation to convert the extended ONNX shape to a TFLite shape + # Now add a permutation to convert the extended ExecuTorch shape to a TFLite shape to_tflite_perm = ( translator.create_channels_first_to_channels_last_permutation( output_rank @@ -1579,20 +1570,20 @@ def prepare_static_tensor_for_correct_broadcasting_with_channels_first_tensors( original_shape = translator.dims_to_channels_first( shape - ) # Same shape as in the ONNX model + ) # Same shape as in the ExecuTorch model # Prepend 1s to the shape - extended_onnx_shape = [1] * rank_diff + original_shape + extended_executorch_shape = [1] * rank_diff + original_shape # Convert the full shape to TFLite format - tflite_shape = translator.dims_to_channels_last(extended_onnx_shape) + tflite_shape = translator.dims_to_channels_last(extended_executorch_shape) tensor.shape = tflite_model.Shape(tflite_shape) # Statically transpose the data data = translator.convert_data_to_channels_first( data - ) # To the same shape as in the ONNX model - data = data.reshape(extended_onnx_shape) # Extend with leading 1s + ) # To the same shape as in the ExecuTorch model + data = data.reshape(extended_executorch_shape) # Extend with leading 1s 
tensor.tmp_buffer.data = translator.convert_data_to_channels_last( data ) # Convert to TFLite format @@ -1600,16 +1591,16 @@ def prepare_static_tensor_for_correct_broadcasting_with_channels_first_tensors( assert tflite_shape == list(tensor.tmp_buffer.data.shape) else: - # The tensor is the same as in the ONNX model. + # The tensor is the same as in the ExecuTorch model. - extended_onnx_shape = [1] * rank_diff + shape + extended_executorch_shape = [1] * rank_diff + shape # Convert the full shape to TFLite format - tflite_shape = translator.dims_to_channels_last(extended_onnx_shape) + tflite_shape = translator.dims_to_channels_last(extended_executorch_shape) tensor.shape = tflite_model.Shape(tflite_shape) # Statically transpose the data - data = data.reshape(extended_onnx_shape) # Extend with leading 1s + data = data.reshape(extended_executorch_shape) # Extend with leading 1s tensor.tmp_buffer.data = translator.convert_data_to_channels_last( data ) # Convert to TFLite format diff --git a/backends/nxp/backend/ir/converter/conversion/common.py b/backends/nxp/backend/ir/converter/conversion/common.py index 8230e39a7fa..318fe66dfbd 100755 --- a/backends/nxp/backend/ir/converter/conversion/common.py +++ b/backends/nxp/backend/ir/converter/conversion/common.py @@ -1,6 +1,6 @@ # # Copyright 2023 Martin Pavella -# Copyright 2023-2024 NXP +# Copyright 2023-2025 NXP # # License: MIT # See the LICENSE_MIT for more details. @@ -12,7 +12,7 @@ 'conversion/builtin/' directory. 
""" -from typing import Any, List, MutableSequence, Optional +from typing import List, MutableSequence, Optional import executorch.backends.nxp.backend.ir.logger as logger from executorch.backends.nxp.backend.ir.tflite_generator import tflite_model @@ -22,28 +22,8 @@ max_pool_2d_options, transpose_conv_options, ) -from torch.fx import Node - - -def exactly_one_is_none(obj1: Optional, obj2: Optional) -> bool: - """Determine if exactly 1 of the arguments is None, or not.""" - return (obj1 is None and obj2 is not None) or (obj1 is not None and obj2 is None) - - -def contains_duplicates(list_to_check: List[Any]) -> bool: - """Determine if given list has duplicate elements or not.""" - return len(list_to_check) != len(set(list_to_check)) - - -def clamp(val: int, start: int, end: int) -> int: - """Clamp an int value between start and end (inclusive) and return it.""" - if val < start: - return start - - elif val > end: - return end - return val +from torch.fx import Node def try_get_input(t_op: tflite_model.Operator, idx: int) -> tflite_model.Tensor | None: @@ -62,11 +42,6 @@ def try_get_input(t_op: tflite_model.Operator, idx: int) -> tflite_model.Tensor tensor = t_op.tmp_inputs[idx] - if tensor.name == "": - # ONNX allows the name "" for optional tensors. It indicates that the tensor should be ignored, and a default - # value should be used. Just like if the tensor was omitted altogether. - return None - return tensor @@ -101,7 +76,7 @@ def assign_2d_strides(options: StridedOptions, strides: Optional[List[int]]): If 'strides' is None, assign 1s. :param options: TFLite AveragePool2D, Conv2D, MaxPool2D or TransposeConv options object. - :param strides: An optional list of ONNX strides attribute. + :param strides: An optional list of ExecuTorch strides attribute. 
""" if strides is None: @@ -115,8 +90,8 @@ def assign_2d_strides(options: StridedOptions, strides: Optional[List[int]]): else: logger.e( - logger.Code.INVALID_ONNX_OPERATOR_ATTRIBUTE, - f"ONNX operator has invalid 'strides' attribute! ('{strides}')", + logger.Code.INVALID_OPERATOR_ATTRIBUTE, + f"ExecuTorch operator has invalid 'strides' attribute! ('{strides}')", ) @@ -188,32 +163,6 @@ def node_uses_shape_broadcasting(node: Node) -> bool: ) -def uses_multiple_input_types(t_op: tflite_model.Operator) -> bool: - """Determine if the input tensors of given TFLite operator use different data types or not. - - :param t_op: TFLite operator with 'tmp_inputs' initialized. - :return: True, if any two input tensors have a different data type. - False, if all input tensors use the same data type. - """ - - if t_op.tmp_inputs is None: - logger.e( - logger.Code.INTERNAL_ERROR, - "common.uses_multiple_input_types(): 'tmp_inputs' are None!", - ) - - if len(t_op.tmp_inputs) == 0: - logger.e( - logger.Code.INTERNAL_ERROR, - "common.uses_multiple_input_types(): Operator has no inputs!", - ) - - first_input_type = t_op.tmp_inputs[0].type - return any( - input_tensor.type != first_input_type for input_tensor in t_op.tmp_inputs[1:] - ) - - class OpsList: """ Holder of TFLite operator (middle_op) that can be prefixed (pre_ops) of suffixed (post_ops) diff --git a/backends/nxp/backend/ir/converter/conversion/translator.py b/backends/nxp/backend/ir/converter/conversion/translator.py index 4f327c6ac80..1fe195843c0 100755 --- a/backends/nxp/backend/ir/converter/conversion/translator.py +++ b/backends/nxp/backend/ir/converter/conversion/translator.py @@ -1,6 +1,5 @@ -# # Copyright 2023 Martin Pavella -# Copyright 2023-2024 NXP +# Copyright 2023-2025 NXP # # License: MIT # See the LICENSE_MIT for more details. @@ -9,10 +8,10 @@ translator Module contains functions for context-free conversion of various -things from ONNX to TFLite. +things from ExecuTorch to NeutronIR. 
""" -from typing import Any, Collection, List, Optional, Sequence, Tuple +from typing import Any, Collection, List, Optional, Sequence import executorch.backends.nxp.backend.ir.lib.tflite.Padding as tflPadding import executorch.backends.nxp.backend.ir.logger as logger @@ -21,16 +20,12 @@ import numpy as np import torch from executorch.backends.nxp.backend.ir.lib.tflite.TensorType import TensorType -from executorch.backends.nxp.backend.ir.tensor_formatting import TensorFormat -from executorch.backends.nxp.backend.ir.tflite_generator.meta.types import ( - TensorFlowDataType, -) def permute_static_tensor(tensor: tflite_model.Tensor, perm: list[int]): - """Take a static TFLite tensor and permute its shape and data according to the permutation in 'perm'. + """Take a static NeutronIR tensor and permute its shape and data according to the permutation in 'perm'. - :param tensor: Static TFLite tensor to permute. + :param tensor: Static NeutronIR tensor to permute. :param perm: Permutation to apply to the tensor. """ @@ -53,7 +48,7 @@ def permute_static_tensor(tensor: tflite_model.Tensor, perm: list[int]): def get_tflite_tensor_shape_with_explicit_padding( tflite_shape: List[int], explicit_padding: List[List[int]] ) -> List[int]: - """Get the resulting shape of a tensor with shape 'tflite_shape' (in TFLite format), after 'explicit_padding' is + """Get the resulting shape of a tensor with shape 'tflite_shape' (in NeutronIR format), after 'explicit_padding' is applied to it. 
""" @@ -62,7 +57,7 @@ def get_tflite_tensor_shape_with_explicit_padding( ): logger.e( logger.Code.INTERNAL_ERROR, - f"Cannot apply padding '{explicit_padding}' to TFLite shape '{tflite_shape}'!", + f"Cannot apply padding '{explicit_padding}' to NeutronIR shape '{tflite_shape}'!", ) total_padding = [ @@ -90,24 +85,9 @@ def get_tflite_tensor_shape_with_explicit_padding( return padded_shape -def convert_tensor_format_to_tflite(tensor_format: TensorFormat) -> TensorFormat: - """Convert the format of a tensor from ONNX to TFLite. - :return: The tensor_format converted to TFLite. - """ - if tensor_format is TensorFormat.CHANNELS_FIRST: - return TensorFormat.CHANNELS_LAST - - elif tensor_format not in (TensorFormat.FORMATLESS, TensorFormat.NONE): - logger.d( - f"translator.convert_tensor_format(): Got unexpected format '{tensor_format}'." - ) - - return tensor_format - - def dims_to_channels_first(channels_last_dimensions: List[int]) -> List[int]: - """Convert a list of ints which represent dimensions in the channels last (TFLite) format to the channels first - (ONNX) format. + """Convert a list of ints which represent dimensions in the channels last (NeutronIR) format to the channels first + (ExecuTorch) format. """ assert len(channels_last_dimensions) > 0, "Dimensions list is empty!" @@ -122,8 +102,8 @@ def dims_to_channels_first(channels_last_dimensions: List[int]) -> List[int]: def dims_to_channels_last(channels_first_dimensions: List[int]) -> List[int]: - """Convert a list of ints which represent dimensions in the channels first (ONNX) format to the channels last - (TFLite) format. + """Convert a list of ints which represent dimensions in the channels first (ExecuTorch) format to the channels last + (NeutronIR) format. """ assert len(channels_first_dimensions) > 0, "Dimensions list is empty!" 
@@ -171,7 +151,7 @@ def _same_upper_equals_same_lower( o_strides: Optional[List[int]] = None, o_dilations: Optional[List[int]] = None, ) -> bool: - """Determine if in a given particular setting, the values of the ONNX `auto_pads` attribute SAME_UPPER and + """Determine if in a given particular setting, the values of the ExecuTorch `auto_pads` attribute SAME_UPPER and SAME_LOWER represent the exact same padding. """ @@ -193,7 +173,7 @@ def _tflite_padding_compute_output_size( """ Calculates the output shape of the tensor with particular setting as tflite would. Implementation corresponds to tensorflow/lite/kernels/padding.h:ComputeOutSize() - :param padding: TFLite Padding value - 'Same' or 'Valid' + :param padding: NeutronIR Padding value - 'Same' or 'Valid' :param tflite_spatial_input_shape: input tensor shape :param tflite_kernel_shape: convolution kernel shape :param strides: strides (default is 1) @@ -229,7 +209,7 @@ def tflite_compute_padding_with_offset( dilations: Optional[List[int]] = None, ) -> (List[int], List[int]): """ - Calculate padding and offset for each dimension for particular convolution setting as TFLite. + Calculate padding and offset for each dimension for particular convolution setting as NeutronIR. Implementation corresponds to tensorflow/lite/kernels/padding.h:ComputePaddingWithOffset() :param tflite_input_shape: tensorflow lite input shape :param tflite_kernel_shape: tensorflow lite kernel shape @@ -272,14 +252,14 @@ def _is_same_padding( o_strides: Optional[List[int]] = None, o_dilations: Optional[List[int]] = None, ) -> bool: - """Determine if given ONNX 'pads' padding can be represented exactly with the TFLite 'SAME' padding type. - - :param o_pads: ONNX 'pads' attribute. - :param tflite_input_shape: The shape of the main input of the operator in TFLite format. - :param tflite_output_shape: The shape of the main output of the operator in TFLite format. - :param o_kernel_shape: ONNX 'kernel_shape' attribute. 
- :param o_strides: ONNX 'strides' attribute. Can be omitted. - :param o_dilations: ONNX 'dilations' attribute. Can be omitted. + """Determine if given ExecuTorch 'pads' padding can be represented exactly with the NeutronIR 'SAME' padding type. + + :param o_pads: ExecuTorch 'pads' attribute. + :param tflite_input_shape: The shape of the main input of the operator in NeutronIR format. + :param tflite_output_shape: The shape of the main output of the operator in NeutronIR format. + :param o_kernel_shape: ExecuTorch 'kernel_shape' attribute. + :param o_strides: ExecuTorch 'strides' attribute. Can be omitted. + :param o_dilations: ExecuTorch 'dilations' attribute. Can be omitted. """ if len(tflite_input_shape) == 0 or len(tflite_output_shape) == 0: @@ -289,7 +269,7 @@ def _is_same_padding( f"'{tflite_input_shape}' and output shape '{tflite_output_shape}'.", ) - # Calculate if the output shape corresponds to Same padding setting in TFLite + # Calculate if the output shape corresponds to Same padding setting in NeutronIR tflite_spatial_input_shape = tflite_input_shape[1:-1] tmp_spatial_output_shape = _tflite_padding_compute_output_size( tflPadding.Padding.SAME, @@ -302,10 +282,10 @@ def _is_same_padding( return False # For every dimension, the padding is added to the start and end of the dimension. - # TFLite padding 'SAME' tries to split it evenly, but in case of odd padding, 'SAME' adds the excess 1 at the end. - # TFLite represents this in the offset. The offset is added to the end of particular dimension, + # NeutronIR padding 'SAME' tries to split it evenly, but in case of odd padding, 'SAME' adds the excess 1 at the end. + # NeutronIR represents this in the offset. The offset is added to the end of particular dimension, # i.e. bottom for H dim, right for W dim and so on. - # ONNX represents this in 'pads' as [x1_begin, x2_begin,... , x1_end, x2_end,...]. + # ExecuTorch represents this in 'pads' as [x1_begin, x2_begin,... , x1_end, x2_end,...]. 
padding, offset = tflite_compute_padding_with_offset( tflite_input_shape, o_kernel_shape, tflite_output_shape, o_strides, o_dilations ) @@ -319,30 +299,6 @@ def _is_same_padding( return True -def permutations_are_inverse( - permutation1: Sequence[int], permutation2: Sequence[int] -) -> bool: - """Determine if given Transpose permutations are inverse of each other. - i.e. when applied back to back, there will be no effect. - - Example: - 0 3 1 2 - 0 2 3 1 - """ - - if len(permutation1) != len(permutation2): - logger.e( - logger.Code.INTERNAL_ERROR, - "translator.permutations_are_inverse(): permutations have different size!", - ) - - for i, perm2 in enumerate(permutation2): - if i != permutation1[perm2]: - return False - - return True - - def combine_permutations( permutation1: Sequence[int], permutation2: Sequence[int] ) -> List[int]: @@ -375,31 +331,35 @@ def shape_from_numpy(numpy_array): return tflite_model.Shape(dims) -def onnx_explicit_padding_to_tflite(onnx_pads: list[int]) -> list[list[int]]: - """Convert the attribute or input 'pads' of the ONNX 'Pad' operator to the 'paddings' input of the TFLite 'Pad' +def executorch_explicit_padding_to_tflite( + executorch_pads: list[int], +) -> list[list[int]]: + """Convert the attribute or input 'pads' of the ExecuTorch 'Pad' operator to the 'paddings' input of the NeutronIR 'Pad' class of operators. This function does NOT take tensor formats into consideration. 
""" - start_padding = onnx_pads[ - : len(onnx_pads) // 2 + start_padding = executorch_pads[ + : len(executorch_pads) // 2 ] # Padding at the start of each dimension - end_padding = onnx_pads[ - len(onnx_pads) // 2 : + end_padding = executorch_pads[ + len(executorch_pads) // 2 : ] # Padding at the end of each dimension return list(zip(start_padding, end_padding)) -def onnx_pads_to_tflite_explicit_padding(onnx_pads: List[int]) -> List[List[int]]: - """Convert an ONNX attribute 'pads' of operators such as Conv, MaxPool or AveragePool, to a list of ints which is - compatible with the TFLite 'Pad' operator. +def executorch_pads_to_tflite_explicit_padding( + executorch_pads: List[int], +) -> List[List[int]]: + """Convert an ExecuTorch attribute 'pads' of operators such as Conv, MaxPool or AveragePool, to a list of ints which is + compatible with the NeutronIR 'Pad' operator. """ - tflite_padding = onnx_explicit_padding_to_tflite(onnx_pads) + tflite_padding = executorch_explicit_padding_to_tflite(executorch_pads) - # TFLite also allows padding to the 'batch' and 'channels'. ONNX does not + # NeutronIR also allows padding to the 'batch' and 'channels'. ExecuTorch does not tflite_padding.insert(0, [0, 0]) tflite_padding.append([0, 0]) @@ -413,15 +373,15 @@ def _get_explicit_tflite_padding_for_same_lower( o_strides: Optional[List[int]] = None, o_dilations: Optional[List[int]] = None, ) -> List[List[int]]: - """Get the TFLite explicit padding required to represent ONNX 'SAME_LOWER' auto_pad for a particular setting. + """Get the NeutronIR explicit padding required to represent ExecuTorch 'SAME_LOWER' auto_pad for a particular setting. - :param tflite_input_shape: TFLite (NHWC) shape of the input tensor of the operator. - :param tflite_output_shape: TFLite (NHWC) shape of the output tensor of the operator. - :param o_kernel_shape: ONNX 'kernel_shape' attribute. - :param o_strides: Optional ONNX 'o_strides' attribute. - :param o_dilations: Optional ONNX 'o_dilations' attribute. 
+ :param tflite_input_shape: NeutronIR (NHWC) shape of the input tensor of the operator. + :param tflite_output_shape: NeutronIR (NHWC) shape of the output tensor of the operator. + :param o_kernel_shape: ExecuTorch 'kernel_shape' attribute. + :param o_strides: Optional ExecuTorch 'o_strides' attribute. + :param o_dilations: Optional ExecuTorch 'o_dilations' attribute. - :return: A TFLite style explicit padding, compatible with the TFLite 'Pad' operator. + :return: A NeutronIR style explicit padding, compatible with the NeutronIR 'Pad' operator. """ padding, offset = tflite_compute_padding_with_offset( @@ -433,102 +393,15 @@ def _get_explicit_tflite_padding_for_same_lower( ] # In case of odd padding, the excess is added at the start end_padding = padding - onnx_explicit_padding = start_padding + end_padding - - # Return explicit ONNX padding converted to TFLite padding - return onnx_pads_to_tflite_explicit_padding(onnx_explicit_padding) - - -def convert_padding( - o_auto_pad: str, - o_pads: List[int], - tflite_input_shape: List[int], - tflite_output_shape: List[int], - o_kernel_shape: List[int], - o_strides: Optional[List[int]], - o_dilations: Optional[List[int]] = None, -) -> Tuple[tflPadding.Padding, Optional[List[List[int]]]]: - """Convert ONNX operator attributes 'pads' and 'auto_pad' to TFLite. - - :param o_auto_pad: ONNX operator attribute 'auto_pad' - :param o_pads: ONNX operator attribute 'pads' - :param tflite_input_shape: The shape of the main input tensor in the TFLite format. - :param tflite_output_shape: The shape of the main output tensor in the TFLite format. - :param o_kernel_shape: ONNX operator attribute 'kernel_shape' - :param o_strides: ONNX operator attribute 'strides' - :param o_dilations: ONNX operator attribute 'dilations' - - :return: A tuple. - The first element is the converted TFLite padding. - The second is None, if conversion is finished. 
Or it is a list of ints representing the explicit - padding in TFLite format (compatible with the 'Pad' operator), which needs to be provided by a - 'Pad' operator. Caller must add this operator using model_builder! - """ - - if o_auto_pad == "SAME_UPPER": - return tflPadding.Padding.SAME, None - - elif o_auto_pad == "SAME_LOWER": - if _same_upper_equals_same_lower( - tflite_input_shape, - tflite_output_shape, - o_kernel_shape, - o_strides, - o_dilations, - ): - return tflPadding.Padding.SAME, None - - else: - logger.d( - "'SAME_LOWER' auto_pad cannot be exactly represented in TFLite as padding 'SAME' or 'VALID'. " - "Inserting an extra 'Pad' operator." - ) - tflite_explicit_padding = _get_explicit_tflite_padding_for_same_lower( - tflite_input_shape, - tflite_output_shape, - o_kernel_shape, - o_strides, - o_dilations, - ) - return tflPadding.Padding.VALID, tflite_explicit_padding - - elif o_auto_pad == "VALID": - return tflPadding.Padding.VALID, None - - # auto_pad is NOTSET -> use explicit padding - elif o_pads is None or all(val == 0 for val in o_pads): - # No padding in any direction - return tflPadding.Padding.VALID, None - - elif _is_same_padding( - o_pads, - tflite_input_shape, - tflite_output_shape, - o_kernel_shape, - o_strides, - o_dilations, - ): - # Explicit padding can be represented with TFLite 'SAME' padding. - return tflPadding.Padding.SAME, None - - else: - # 'pads' cannot be converted directly. Return 'VALID' and the required explicit padding and caller must - # implement conversion by adding a 'Pad' operator. - - logger.d( - "Explicit ONNX 'pads' cannot be represented directly as 'SAME' or 'VALID'. " - "Inserting an extra 'Pad' operator." - ) - - # ONNX 'pads' uses different format than TFLite 'Pad' operator. Convert the explicit padding. 
- tflite_explicit_padding = onnx_pads_to_tflite_explicit_padding(o_pads) + executorch_explicit_padding = start_padding + end_padding - return tflPadding.Padding.VALID, tflite_explicit_padding + # Return explicit ExecuTorch padding converted to NeutronIR padding + return executorch_pads_to_tflite_explicit_padding(executorch_explicit_padding) def convert_data_to_channels_first(array: np.ndarray) -> np.ndarray: - """Convert a numpy array representing the data of a tensor from the channels last format (TFLite), to channels - first format (ONNX). + """Convert a numpy array representing the data of a tensor from the channels last format (NeutronIR), to channels + first format (ExecuTorch). :param array: Numpy array holding the tensor's data. :return: The transformed data. @@ -543,8 +416,8 @@ def convert_data_to_channels_first(array: np.ndarray) -> np.ndarray: def convert_data_to_channels_last(array: np.ndarray) -> np.ndarray: - """Convert a numpy array representing the data of a tensor from the channels first format (ONNX), to channels last - format (TFLite). + """Convert a numpy array representing the data of a tensor from the channels first format (ExecuTorch), to channels last + format (NeutronIR). :param array: Numpy array holding the tensor's data. :return: The transformed data. 
@@ -558,17 +431,6 @@ def convert_data_to_channels_last(array: np.ndarray) -> np.ndarray: return np.moveaxis(array, 1, -1) # Move the second axis (C), to the end -def channels_first_shape_to_channels_last( - channels_first_shape: tflite_model.Shape, -) -> tflite_model.Shape: - """Create a channels last version of a channels first 'tflite_model.Shape' object.""" - - dims = channels_first_shape.vector.copy() - dims = dims_to_channels_last(dims) - - return tflite_model.Shape(dims) - - def channels_last_shape_to_channels_first( nhwc_shape: tflite_model.Shape, ) -> tflite_model.Shape: @@ -580,23 +442,13 @@ def channels_last_shape_to_channels_first( return tflite_model.Shape(dims) -def convert_onnx_dimensions_to_tflite_shape(o_dims: List[int]) -> tflite_model.Shape: - """Convert list of ints representing the shape of an ONNX channels first Tensor to a TFLite 'Shape' object.""" - - dims = list(o_dims) # Copy just in case - - dims = dims_to_channels_last(dims) - - return tflite_model.Shape(dims) - - def create_channels_last_to_channels_first_permutation( rank: int, return_list: bool = False ) -> np.ndarray | list[int]: """Return a numpy array with data that describes the permutation, which would change a tensor from the channels - last (TFLite) format to the channels first (ONNX) format. + last (NeutronIR) format to the channels first (ExecuTorch) format. - This permutation is compatible with the TFLite `Transpose` operator. + This permutation is compatible with the NeutronIR `Transpose` operator. :param rank: The rank of the required permutation. :param return_list: If True, the function returns a list of ints. If False, a numpy array is returned. @@ -615,9 +467,9 @@ def create_channels_first_to_channels_last_permutation( rank: int, return_list: bool = False ) -> np.ndarray | list[int]: """Return a numpy array with data that describes the permutation, which would change a tensor from the channels - first (ONNX) format to the channels last (TFLite) format. 
+ first (ExecuTorch) format to the channels last (NeutronIR) format. - This permutation is compatible with the TFLite `Transpose` operator. + This permutation is compatible with the NeutronIR `Transpose` operator. :param rank: The rank of the required permutation. :param return_list: If True, the function returns a list of ints. If False, a numpy array is returned. @@ -632,35 +484,8 @@ def create_channels_first_to_channels_last_permutation( return np.asarray(perm, np.int32) -def create_axis_to_last_perm(axis, num_dims): - """Create a numpy array representing the transpose permutations needed, to - make the 'axis' dimension, the last dimension. - """ - - dims = list(range(num_dims)) - - if axis == num_dims - 1: - return dims - elif axis >= num_dims or axis < 0: - logger.e( - logger.Code.INTERNAL_ERROR, - f"translator.create_axis_to_last_perm({axis},{num_dims}). Inputs don't make sense!", - ) - - # Remember axis dimension - axis_dim = dims[axis] - - # Move dimensions after 'axis' to the left - dims[axis:-1] = dims[axis + 1 : -1] - - # Add axis dimension to the end - dims.append(axis_dim) - - return np.asarray(dims, np.int32) - - def apply_permutation_to(target: List[Any], permutation: Collection[int]) -> List: - """Permute a list according to a permutation. Uses the same permutation format as the TFLite Transpose operator. + """Permute a list according to a permutation. Uses the same permutation format as the NeutronIR Transpose operator. :param target: A list of any types, to permute. Must be same size as the permutation. :param permutation: The permutation to apply to the target. @@ -678,7 +503,7 @@ def apply_permutation_to(target: List[Any], permutation: Collection[int]) -> Lis def create_inverse_permutation(permutation: List[int]) -> List[int]: """Create and return a permutation, that is the inverse of the given 'permutation' parameter. - Uses the same permutation format as the TFLite Transpose operator. 
+ Uses the same permutation format as the NeutronIR Transpose operator. :param permutation: The permutation to create the inverse of. :return: Inverse permutation. @@ -694,38 +519,8 @@ def create_inverse_permutation(permutation: List[int]) -> List[int]: return [permutation.index(perm) for perm in range(len(permutation))] -def get_max_value_for_type(dtype: np.dtype) -> any: - """Return the maximum possible value for given numpy type.""" - if dtype.kind in ("i", "u"): - return np.iinfo(dtype).max - - elif dtype.kind == "f": - return np.finfo(dtype).max - - else: - logger.e( - logger.Code.INTERNAL_ERROR, - f"translator.get_max_value_for_type(): unexpected type {dtype.name}.", - ) - - -def get_min_value_for_type(dtype: np.dtype) -> any: - """Return the minimum possible value for given numpy type.""" - if dtype.kind in ("i", "u"): - return np.iinfo(dtype).min - - elif dtype.kind == "f": - return np.finfo(dtype).min - - else: - logger.e( - logger.Code.INTERNAL_ERROR, - f"translator.get_min_value_for_type(): unexpected type {dtype.name}.", - ) - - def convert_data_type(torch_type: torch.TensorType) -> TensorType: - """Convert Torch DataType to TFLite TensorType""" + """Convert Torch DataType to NeutronIR TensorType""" if torch_type == torch.float32: return TensorType.FLOAT32 @@ -753,7 +548,7 @@ def convert_data_type(torch_type: torch.TensorType) -> TensorType: def torch_type_to_numpy_type(torch_type: torch.TensorType) -> np.ScalarType: - """Convert Torch DataType to TFLite TensorType""" + """Convert Torch DataType to NeutronIR TensorType""" if torch_type == torch.float32: return np.dtype(np.float32) @@ -778,10 +573,10 @@ def torch_type_to_numpy_type(torch_type: torch.TensorType) -> np.ScalarType: def numpy_type_to_tf_lite(numpy_type: np.dtype) -> TensorType: # noqa C901 - """Convert the numpy data type to a corresponding TFLite 'TensorType'. + """Convert the numpy data type to a corresponding NeutronIR 'TensorType'. :param numpy_type: Numpy dtype to convert. 
- :return: Corresponding TFLite TensorType. + :return: Corresponding NeutronIR TensorType. """ numpy_type = numpy_type.type @@ -835,12 +630,12 @@ def numpy_type_to_tf_lite(numpy_type: np.dtype) -> TensorType: # noqa C901 else: logger.e( logger.Code.CONVERSION_IMPOSSIBLE, - f"Cannot convert numpy data type '{numpy_type}' to TFLite.", + f"Cannot convert numpy data type '{numpy_type}' to NeutronIR.", ) def tf_lite_type_to_numpy(tfl_type: TensorType) -> np.ScalarType: # noqa C901 - """Convert TFLite TensorType to numpy dtype""" + """Convert NeutronIR TensorType to numpy dtype""" if tfl_type == TensorType.FLOAT32: return np.dtype(np.float32) @@ -890,72 +685,5 @@ def tf_lite_type_to_numpy(tfl_type: TensorType) -> np.ScalarType: # noqa C901 else: logger.e( logger.Code.CONVERSION_IMPOSSIBLE, - f"Cannot convert TFLite type '{tfl_type}' to numpy dtype.", + f"Cannot convert NeutronIR type '{tfl_type}' to numpy dtype.", ) - - -def tflite_type_to_tensor_flow_data_type(tfl_type: TensorType) -> TensorFlowDataType: - """Convert TFLite TensorType to the internal type of TensorFlow.""" - match tfl_type: - case TensorType.FLOAT16: - # There seems to be no counterpart in the TF DataType. 
- logger.e( - logger.Code.INTERNAL_ERROR, - "tflite_type_to_tensor_flow_data_type(): float16.", - ) - case TensorType.FLOAT32: - return TensorFlowDataType.DT_FLOAT.value - case TensorType.FLOAT64: - return TensorFlowDataType.DT_DOUBLE.value - - case TensorType.INT4: - return TensorFlowDataType.DT_INT4.value - case TensorType.INT8: - return TensorFlowDataType.DT_INT8.value - case TensorType.INT16: - return TensorFlowDataType.DT_INT16.value - case TensorType.INT32: - return TensorFlowDataType.DT_INT32.value - case TensorType.INT64: - return TensorFlowDataType.DT_INT64.value - - case TensorType.UINT8: - return TensorFlowDataType.DT_UINT8.value - case TensorType.UINT16: - return TensorFlowDataType.DT_UINT16.value - case TensorType.UINT32: - return TensorFlowDataType.DT_UINT32.value - case TensorType.UINT64: - return TensorFlowDataType.DT_UINT64.value - - case TensorType.COMPLEX64: - return TensorFlowDataType.DT_COMPLEX64.value - case TensorType.COMPLEX128: - return TensorFlowDataType.DT_COMPLEX128.value - - case TensorType.STRING: - return TensorFlowDataType.DT_STRING.value - - case TensorType.BOOL: - return TensorFlowDataType.DT_BOOL.value - - case TensorType.RESOURCE: - return TensorFlowDataType.DT_RESOURCE.value - case TensorType.VARIANT: - return TensorFlowDataType.DT_VARIANT.value - - case _: - # All TFLite types are covered. Must be an invalid type. - logger.e( - logger.Code.INTERNAL_ERROR, - f"tflite_type_to_tensor_flow_data_type(): invalid TFLite type `{tfl_type}`.", - ) - - -def infer_kernel_shape(weight_tensor: tflite_model.Tensor) -> list[int]: - """Returns the kernel shape inferred from the weight tensor. - - Weight tensors shape expected in TFlite Format, where the 0th index is output channels count, last is input channels - count. 
- """ - return weight_tensor.shape.vector[1:-1] diff --git a/backends/nxp/backend/ir/converter/node_converter.py b/backends/nxp/backend/ir/converter/node_converter.py index ed624aaa411..36266486aac 100755 --- a/backends/nxp/backend/ir/converter/node_converter.py +++ b/backends/nxp/backend/ir/converter/node_converter.py @@ -4,7 +4,6 @@ # LICENSE file in the root directory of this source tree. from abc import ABC, abstractmethod -from enum import Enum import torch @@ -16,8 +15,10 @@ AtenModelBuilderDirector, ) from executorch.backends.nxp.backend.ir.tflite_generator import tflite_model +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from executorch.exir.dialects._ops import ops as exir_ops from torch.fx import Node +from torch.fx.passes.infra.partitioner import Partition from torch.nn import Parameter @@ -37,15 +38,8 @@ def _is_dequant_node(node: torch.fx.Node) -> bool: ] -class Target(Enum): - IGNORE = "ignore" # No target platform. Any target specific restrictions will be ignored. - - RT700 = "imxrt700" - IMX95 = "imx95" - - @classmethod - def values(cls) -> list[str]: - return [elt.value for elt in cls] +def is_not_qdq_node(node: torch.fx.Node) -> bool: + return not (_is_quant_node(node) or _is_dequant_node(node)) class NodeConverter(ABC): @@ -89,7 +83,7 @@ def _is_supported_in_IR( @staticmethod def _is_supported_on_target( node: Node, - target: Target, + neutron_target_spec: NeutronTargetSpec, parameters_mapping: dict[str, Parameter], custom_delegation_options: CustomDelegationOptions, ) -> bool: @@ -98,33 +92,50 @@ def _is_supported_on_target( can be used by operators with no target specific requirements. :param node: The node (edge operator) to check. - :param target: Value of the `Target` enum representing the target platform to check for. + :param neutron_target_spec: Object for querying the target platform to retrieve its properties. 
:param parameters_mapping: Dictionary mapping tensor names to their static data (if they have it). :param custom_delegation_options: Custom options which affect delegation. """ - return target == Target.RT700 + return True @classmethod def is_supported( cls, node: Node, - target: Target, + neutron_target_spec: NeutronTargetSpec, parameters_mapping: dict[str, Parameter], custom_delegation_options: CustomDelegationOptions, ) -> bool: """Check if the given `node` is supported in the IR and on the given `target` platform. :param node: torch.Node to check. - :param target: Value of the `Target` enum representing the target platform to check for. + :param neutron_target_spec: Object for querying the target platform to retrieve its properties. :param parameters_mapping: Dict mapping tensor names to their data. :param custom_delegation_options: Custom user options which affect node delegation. """ return cls._is_supported_in_IR( node, parameters_mapping, custom_delegation_options ) and cls._is_supported_on_target( - node, target, parameters_mapping, custom_delegation_options + node, neutron_target_spec, parameters_mapping, custom_delegation_options ) + @classmethod + def supports_partitioning_result( + cls, + node: Node, + partition_list: list[Partition], + custom_delegation_options: CustomDelegationOptions, + ): + """Check if the given `node` supports the assigned partitioning, which is stored the `partition_list`. Child + classes can overwrite this method in case they have delegation restrictions based on the context defined by + the partitioning result. + + :param node: torch.Node to check. + :param partition_list: List of proposed partitions. + :param custom_delegation_options: Custom user options which affect node delegation. 
+ """ + return True + @staticmethod def _has_shared_q_params_if_quantized(node: Node) -> bool: """Check if node has shared quantization parameters if it's quantized.""" diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py index d1674e16a9f..3cf70f46b8d 100755 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/__init__.py @@ -41,7 +41,8 @@ PermuteCopyConverter, ) from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.qdq_dequantize_converter import ( - QDQDequantizeConverter, + QDQPerChannelDequantizeConverter, + QDQPerTensorDequantizeConverter, ) from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.qdq_quantize_converter import ( QDQQuantizeConverter, @@ -55,6 +56,9 @@ from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.softmax_converter import ( SoftmaxConverter, ) +from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.sub_tensor_converter import ( + SubTensorConverter, +) from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters.tanh_converter import ( TanhConverter, ) @@ -70,7 +74,8 @@ "PermuteCopyConverter", "SoftmaxConverter", "ViewCopyConverter", - "QDQDequantizeConverter", + "QDQPerTensorDequantizeConverter", + "QDQPerChannelDequantizeConverter", "QDQQuantizeConverter", "ConstantPadNDConverter", "ReLUConverter", @@ -78,6 +83,7 @@ "MaxPool2dConverter", "AvgPool2dConverter", "AddTensorConverter", + "SubTensorConverter", "CloneConverter", "AbsConverter", "AdaptiveAvgPool2dConverter", diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/add_tensor_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/add_tensor_converter.py index c74baa61f67..cd5aa2ead81 100644 --- 
a/backends/nxp/backend/ir/converter/node_converters/ops_converters/add_tensor_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/add_tensor_converter.py @@ -9,11 +9,11 @@ from executorch.backends.nxp.backend.ir.converter.node_converter import ( CustomDelegationOptions, NodeConverter, - Target, ) from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options import ( add_options, ) +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from torch.fx import Node from torch.nn import Parameter @@ -22,20 +22,15 @@ class AddTensorConverter(NodeConverter): @staticmethod def _is_supported_on_target( node: Node, - target: Target, + neutron_target_spec: NeutronTargetSpec, parameters_mapping: dict[str, Parameter], custom_delegation_options: CustomDelegationOptions, ) -> bool: - match target: - case Target.RT700: - if node_uses_shape_broadcasting(node): - # Shape broadcasting may require the addition of `Transpose` ops during conversion. - return False - - return True + if node_uses_shape_broadcasting(node): + # Shape broadcasting may require the addition of `Transpose` ops during conversion. 
+ return False - case _: - return False + return True @staticmethod def _is_supported_in_IR( diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/cat_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/cat_converter.py index 4f7f00fe5ba..22ca258cd4f 100644 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/cat_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/cat_converter.py @@ -13,11 +13,11 @@ _is_dequant_node, _is_quant_node, NodeConverter, - Target, ) from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options.concatenation_options import ( Concatenation, ) +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from torch.fx import Node from torch.nn import Parameter @@ -72,51 +72,52 @@ def _all_io_shares_quantization_parameters(node: Node) -> bool: @staticmethod def _is_supported_on_target( node: Node, - target: Target, + neutron_target_spec: NeutronTargetSpec, parameters_mapping: dict[str, Parameter], custom_delegation_options: CustomDelegationOptions, ) -> bool: if custom_delegation_options.force_delegate_cat: return True - match target: - case Target.RT700: - dim = CatConverter._get_normalized_dim(node) - - # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1491 - if dim == 0: - return False - - # Neutron requires the channels to be a multiple of `8`. The channels could either be the second or the - # last dimension, depending on the formats of the node. The format, however, cannot be determined - # during conversion, as it depends on what other nodes are delegated. - input_channels = [ - # The second dimension is the channels in PyTorch. If the inputs/output are not channels first, it - # will still be the channels in the IR. - _get_shape(input_)[1] - for input_ in node.all_input_nodes - ] + [ - # If the inputs/outputs are channels first, the last dimension will be the channels. 
- _get_shape(input_)[-1] - for input_ in node.all_input_nodes - ] - if any((input_channel % 8) != 0 for input_channel in input_channels): - # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1492 - return False - - output_channels = [_get_shape(node)[1], _get_shape(node)[-1]] - # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1493 - if any((out_c % 8) != 0 for out_c in output_channels): - return False - - if len(node.all_input_nodes) < 2: # Not supported on Neutron - # TODO Try to skip the operator if this case is realistic. - return False - - return True - - case _: - return False + dim = CatConverter._get_normalized_dim(node) + + # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1491 + if dim == 0: + return False + + # Neutron requires the channels to be a multiple of numMacs. The channels could either be the second or the + # last dimension, depending on the formats of the node. The format, however, cannot be determined + # during conversion, as it depends on what other nodes are delegated. + input_channels = [ + # The second dimension is the channels in PyTorch. If the inputs/output are not channels first, it + # will still be the channels in the IR. + _get_shape(input_)[1] + for input_ in node.all_input_nodes + ] + [ + # If the inputs/outputs are channels first, the last dimension will be the channels. + _get_shape(input_)[-1] + for input_ in node.all_input_nodes + ] + if any( + (input_channel % neutron_target_spec.get_num_macs()) != 0 + for input_channel in input_channels + ): + # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1492 + return False + + output_channels = [_get_shape(node)[1], _get_shape(node)[-1]] + # neutron-library/src/utils/NeutronLibraryInterrogation.cpp#1493 + if any( + (out_c % neutron_target_spec.get_num_macs()) != 0 + for out_c in output_channels + ): + return False + + if len(node.all_input_nodes) < 2: # Not supported on Neutron + # TODO Try to skip the operator if this case is realistic. 
+ return False + + return True @staticmethod def _is_supported_in_IR( diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/clone_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/clone_converter.py index 1d370ab8c48..17b2cee9874 100644 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/clone_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/clone_converter.py @@ -20,6 +20,11 @@ def _has_supported_memory_format(node: Node) -> bool: class CloneConverter(NodeConverter): + """ + This converter is responsible for converting both edge operators: + - aten.clone.default + - dim_order_ops._clone_dim_order.default + """ @staticmethod def _is_supported_in_IR( diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/constant_pad_nd_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/constant_pad_nd_converter.py index f58df1a88d9..499541aa58c 100644 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/constant_pad_nd_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/constant_pad_nd_converter.py @@ -17,7 +17,6 @@ from executorch.backends.nxp.backend.ir.converter.node_converter import ( CustomDelegationOptions, NodeConverter, - Target, ) from executorch.backends.nxp.backend.ir.converter.quantization_utils import ( quantize_int8, @@ -27,6 +26,7 @@ pad_options, pad_v2_options, ) +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from torch.fx import Node from torch.nn import Parameter @@ -35,22 +35,16 @@ class ConstantPadNDConverter(NodeConverter): @staticmethod def _is_supported_on_target( node: Node, - target: Target, + neutron_target_spec: NeutronTargetSpec, parameters_mapping: dict[str, Parameter], custom_delegation_options: CustomDelegationOptions, ) -> bool: - match target: - case Target.RT700: - # TODO: Consider different tensor formats 
(dim-order) - paddings = node.args[1] - if len(paddings) > 4 and paddings[4:6] != [0, 0]: - # Attempt to Pad channels dimension, which is not supported on Neutron. - return False - - return True - - case _: - return False + paddings = node.args[1] + if len(paddings) > 4 and paddings[4:6] != [0, 0]: + # Attempt to Pad channels dimension, which is not supported on Neutron. + return False + + return True @staticmethod def _is_supported_in_IR( diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/convolution_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/convolution_converter.py index 0f3a4b9bb5a..f32b5a65cac 100644 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/convolution_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/convolution_converter.py @@ -25,7 +25,6 @@ from executorch.backends.nxp.backend.ir.converter.node_converter import ( CustomDelegationOptions, NodeConverter, - Target, ) from executorch.backends.nxp.backend.ir.converter.node_converters.shared import ( conv_utils, @@ -45,6 +44,7 @@ depthwise_conv_2d_options, reshape_options, ) +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from torch.fx import Node from torch.nn import Parameter @@ -53,45 +53,38 @@ class ConvolutionConverter(NodeConverter): @staticmethod def _is_supported_on_target( node: Node, - target: Target, + neutron_target_spec: NeutronTargetSpec, parameters_mapping: dict[str, Parameter], custom_delegation_options: CustomDelegationOptions, ) -> bool: - match target: - case Target.RT700: - activations = node.args[0] - weights = node.args[1] - groups = node.args[8] - - if activations.meta["val"].shape[0] != 1: - # Only batch size 1 is supported on neutron. - return False - - if groups == 1: # Regular convolution. - pass - elif conv_utils.group_conv_convertible_as_depthwise( - node, groups - ): # Depthwise convolution. 
- # Only supported if the weights are static, because TFLite `DepthwiseConv2D` uses permuted - # weights. In case the weights are dynamic, a Transpose operator would have to be added, which - # is not supported on Neutron. - if not node_is_effectively_static_tensor( - weights, parameters_mapping - ): - return False - elif conv_utils.group_conv_convertible_into_multiple_convolutions( - node, groups - ): # Separable conv. This should never be reached, as the node should have been decomposed into - # multiple parallel convolutions by the `SplitGroupConvolution` pre-processing pass. - logging.warning("Group convolution was not decomposed.") - return False - else: # Unexpected case (should never happen). - return False - - return True - - case _: + activations = node.args[0] + weights = node.args[1] + groups = node.args[8] + + if activations.meta["val"].shape[0] != 1: + # Only batch size 1 is supported on neutron. + return False + + if groups == 1: # Regular convolution. + pass + elif conv_utils.group_conv_convertible_as_depthwise( + node, groups + ): # Depthwise convolution. + # Only supported if the weights are static, because TFLite `DepthwiseConv2D` uses permuted + # weights. In case the weights are dynamic, a Transpose operator would have to be added, which + # is not supported on Neutron. + if not node_is_effectively_static_tensor(weights, parameters_mapping): return False + elif conv_utils.group_conv_convertible_into_multiple_convolutions( + node, groups + ): # Separable conv. This should never be reached, as the node should have been decomposed into + # multiple parallel convolutions by the `SplitGroupConvolution` pre-processing pass. + logging.warning("Group convolution was not decomposed.") + return False + else: # Unexpected case (should never happen). 
+ return False + + return True @staticmethod def _is_supported_in_IR( @@ -238,7 +231,7 @@ def _convert_1d_conv( def _convert_unpadded_2D( self, t_op: tflite_model.Operator, conv_params: ConvParameters ) -> conv_utils.ConvConversionResult: - """Convert the `aten.convolution` into TFLite. The `padding` and `builtin_options` must be converter by the + """Convert the `aten.convolution` into TFLite. The `padding` and `builtin_options` must be converted by the caller. """ common.assign_2d_strides(t_op.builtin_options, conv_params.stride) @@ -321,6 +314,10 @@ def _convert_2d_conv( t_op.tmp_inputs[1] = self.builder.create_transposed_tensor( weight_tensor, perm ) + + if t_op.tmp_inputs[1].quantization is not None: + # Model is quantized + t_op.tmp_inputs[1].quantization.quantized_dimension = 3 else: raise NotImplementedError("Dynamic Depthwise Conv weights.") diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py index f03c403876f..c1dd7b600be 100644 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py @@ -12,7 +12,6 @@ from executorch.backends.nxp.backend.ir.converter.node_converter import ( CustomDelegationOptions, NodeConverter, - Target, ) from executorch.backends.nxp.backend.ir.converter.node_converters.shared.reduce_utils import ( convert_axes_from_attribute, @@ -20,6 +19,7 @@ from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options import ( mean_options, ) +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from torch.fx import Node from torch.nn import Parameter @@ -28,34 +28,20 @@ class MeanDimConverter(NodeConverter): @staticmethod def _is_supported_on_target( node: Node, - target: Target, + neutron_target_spec: NeutronTargetSpec, parameters_mapping: 
dict[str, Parameter], custom_delegation_options: CustomDelegationOptions, ) -> bool: - match target: - case Target.RT700: - # TODO: Consider different tensor formats (dim-order) - dim = node.args[1] - keepdim = node.args[2] if len(node.args) >= 3 else False - rank = len(node.args[0].meta["val"].shape) - dim = [MeanDimConverter._to_neg_dim(d, rank) for d in dim] - - # Only last 2 dimensions (H, W) and keepdim=True with rank=4 are supported on Neutron. - if rank != 4 or dim not in [[-1, -2], [-2, -1]] or not keepdim: - return False - - return True - - case _: - return False + dim = node.args[1] + keepdim = node.args[2] if len(node.args) >= 3 else False + rank = len(node.args[0].meta["val"].shape) + dim = [d - rank if d > 0 else d for d in dim] - @staticmethod - def _to_pos_dim(d, rank): - return d + rank if d < 0 else d + # Only last 2 dimensions (H, W) and keepdim=True with rank=4 are supported on Neutron. + if rank != 4 or dim not in [[-1, -2], [-2, -1]] or not keepdim: + return False - @staticmethod - def _to_neg_dim(d, rank): - return d - rank if d > 0 else d + return True @staticmethod def _is_supported_in_IR( @@ -75,6 +61,10 @@ def _is_supported_in_IR( return True + @staticmethod + def _to_pos_dim(d: int, rank: int): + return d + rank if d < 0 else d + @staticmethod def _normalize_and_to_channel_last_dim(dim: list[int], rank: int) -> list[int]: # convert negative index to positive diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/qdq_dequantize_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/qdq_dequantize_converter.py index c6ea7f90042..1d7c6b44627 100644 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/qdq_dequantize_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/qdq_dequantize_converter.py @@ -2,6 +2,7 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+from abc import ABC, abstractmethod import numpy as np @@ -19,7 +20,15 @@ from torch.nn import Parameter -class QDQDequantizeConverter(NodeConverter): +class QDQDequantizeConverterBase(NodeConverter, ABC): + + @abstractmethod + def get_zero_point(self, node: Node) -> np.ndarray: + pass + + @abstractmethod + def get_scale(self, node: Node) -> np.ndarray: + pass @staticmethod def _is_supported_in_IR( @@ -27,7 +36,7 @@ def _is_supported_in_IR( parameters_mapping: dict[str, Parameter], custom_delegation_options: CustomDelegationOptions, ) -> bool: - zero_point_type = torch_type_to_numpy_type(node.args[5]) + zero_point_type = torch_type_to_numpy_type(node.args[-1]) if "cluster" not in node.meta or zero_point_type not in [np.int8, np.int32]: return False @@ -39,10 +48,8 @@ def convert(self, node: Node): from_tensor = self.builder.tensor_for_name(node.name) to_tensor = self.builder.tensor_for_name(node.args[0].name) - zero_point_type = torch_type_to_numpy_type(node.args[5]) - - scale = np.array(node.args[1], dtype=np.float32) - zero_point = np.array(node.args[2], dtype=zero_point_type) + scale = self.get_scale(node) + zero_point = self.get_zero_point(node) if self.context.parameters_mapping.get(node.args[0].name, None) is None: # Convert dequantize as identity op (Transpose that will be removed) because @@ -63,3 +70,22 @@ def convert(self, node: Node): # Change type so we pass check tensor similarity check when redirecting from_tensor.type = to_tensor.type self.builder.redirect_tensor(from_tensor, to_tensor) + + +class QDQPerTensorDequantizeConverter(QDQDequantizeConverterBase): + + def get_zero_point(self, node: Node) -> np.ndarray: + zero_point_type = torch_type_to_numpy_type(node.args[5]) + return np.array(node.args[2], dtype=zero_point_type) + + def get_scale(self, node: Node) -> np.ndarray: + return np.array(node.args[1], dtype=np.float32) + + +class QDQPerChannelDequantizeConverter(QDQDequantizeConverterBase): + + def get_zero_point(self, node: Node) -> np.ndarray: 
+ return self.context.parameters_mapping[node.args[2].name].numpy() + + def get_scale(self, node: Node) -> np.ndarray: + return self.context.parameters_mapping[node.args[1].name].numpy() diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/softmax_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/softmax_converter.py index aa74c78ca24..5e4404d8476 100644 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/softmax_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/softmax_converter.py @@ -7,13 +7,11 @@ CustomDelegationOptions, ) from executorch.backends.nxp.backend.edge_helper import input_rank -from executorch.backends.nxp.backend.ir.converter.node_converter import ( - NodeConverter, - Target, -) +from executorch.backends.nxp.backend.ir.converter.node_converter import NodeConverter from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options import ( softmax_options, ) +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from torch.fx import Node from torch.nn import Parameter @@ -22,18 +20,11 @@ class SoftmaxConverter(NodeConverter): @staticmethod def _is_supported_on_target( node: Node, - target: Target, + neutron_target_spec: NeutronTargetSpec, parameters_mapping: dict[str, Parameter], custom_delegation_options: CustomDelegationOptions, ) -> bool: - match target: - case Target.RT700: - # The eIQ Neutron NPU runtime software has a known issue with the SoftMax operation. - # As long as the issue is present, return False for the i.MX RT700 target also. 
- return False - - case _: - return False + return False @staticmethod def _is_supported_in_IR( diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/sub_tensor_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/sub_tensor_converter.py new file mode 100644 index 00000000000..e9522c87114 --- /dev/null +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/sub_tensor_converter.py @@ -0,0 +1,59 @@ +# Copyright 2025 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from executorch.backends.nxp.backend.ir.converter.conversion.common import ( + node_uses_shape_broadcasting, +) +from executorch.backends.nxp.backend.ir.converter.node_converter import ( + CustomDelegationOptions, + NodeConverter, +) +from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options import ( + sub_options, +) +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec +from torch.fx import Node +from torch.nn import Parameter + + +class SubTensorConverter(NodeConverter): + @staticmethod + def _is_supported_on_target( + node: Node, + neutron_target_spec: NeutronTargetSpec, + parameters_mapping: dict[str, Parameter], + custom_delegation_options: CustomDelegationOptions, + ) -> bool: + if node_uses_shape_broadcasting(node): + # Shape broadcasting may require the addition of `Transpose` ops during conversion. + return False + + return True + + @staticmethod + def _is_supported_in_IR( + node: Node, + parameters_mapping: dict[str, Parameter], + custom_delegation_options: CustomDelegationOptions, + ) -> bool: + if len(node.args) != 2: + return False + + # The `alpha` attribute can be represented by adding an extra `Mul` operator. + # However, this is not implemented as `alpha` is rarely used. 
+ if hasattr(node.kwargs, "alpha"): + return False + + return True + + # sub.Tensor Node format: (Tensor self, Tensor other, *, Scalar alpha=1) + def convert(self, node: Node): + """Convert 'sub_tensor' operator to NeutronIR 'Sub'.""" + self.assert_convertible(node) + + t_op = self._create_tflite_op_with_io_tensors(node) + + t_op.builtin_options = sub_options.Sub() + self.builder.append_operators([t_op]) diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/view_copy_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/view_copy_converter.py index 95a42d5d078..22eff3ebb5f 100644 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/view_copy_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/view_copy_converter.py @@ -14,6 +14,7 @@ from executorch.backends.nxp.backend.ir.converter.conversion.common import OpsList from executorch.backends.nxp.backend.ir.converter.node_converter import ( CustomDelegationOptions, + is_not_qdq_node, NodeConverter, ) from executorch.backends.nxp.backend.ir.converter.node_converters.shared.reshape_transposition import ( @@ -23,6 +24,7 @@ reshape_options, ) from torch.fx import Node +from torch.fx.passes.infra.partitioner import Partition from torch.nn import Parameter @@ -45,6 +47,27 @@ def _is_supported_in_IR( return True + @classmethod + def supports_partitioning_result( + cls, + node: Node, + partition_list: list[Partition], + custom_delegation_options: CustomDelegationOptions, + ): + view_copy_partitions = [ + partition for partition in partition_list if node in partition.nodes + ] + assert len(view_copy_partitions) == 1 + non_q_dq_partition_nodes = list( + filter(is_not_qdq_node, view_copy_partitions[0].nodes) + ) + + if len(non_q_dq_partition_nodes) == 1: + # The `view_copy` cannot be the only node in a partition. 
+ return False + + return True + @staticmethod def _safe_compute_flat_size(shape: list[int | str]) -> int: """Compute the flat size of a tensor with given shape. Strings and negative dimensions are treated as '1'. diff --git a/backends/nxp/backend/ir/converter/node_converters/shared/recurrent_utils.py b/backends/nxp/backend/ir/converter/node_converters/shared/recurrent_utils.py index 50b9aef6d18..52b895d60cd 100755 --- a/backends/nxp/backend/ir/converter/node_converters/shared/recurrent_utils.py +++ b/backends/nxp/backend/ir/converter/node_converters/shared/recurrent_utils.py @@ -1,19 +1,12 @@ -# Copyright 2024 NXP +# Copyright 2024-2025 NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from executorch.backends.nxp.backend.ir import logger from executorch.backends.nxp.backend.ir.converter.builder import model_builder from executorch.backends.nxp.backend.ir.converter.conversion import translator -from executorch.backends.nxp.backend.ir.converter.conversion.common import ( - OpsList, - try_get_input, -) +from executorch.backends.nxp.backend.ir.converter.conversion.common import OpsList from executorch.backends.nxp.backend.ir.converter.tensor_utils import tensor_has_data -from executorch.backends.nxp.backend.ir.lib.tflite.ActivationFunctionType import ( - ActivationFunctionType, -) from executorch.backends.nxp.backend.ir.tensor_formatting import TensorFormat from executorch.backends.nxp.backend.ir.tflite_generator import tflite_model @@ -25,12 +18,12 @@ def ensure_correct_tensor_formatting( or RNN operator. The LSTM/RNN may be using channels last tensors, because of the surrounding operators. LSTM/RNN requires its own - format, however I think the input tensors should be marked as 'FORMATLESS', because the main inputs of TFLite - and ONNX version of the operators have the same shape. 
+ format, however I think the input tensors should be marked as 'FORMATLESS', because the main inputs of the + NeutronIR and the ExecuTorch version of the operators have the same shape. I believe that the cleanest and most robust way to solve this, is to mark LSTM/RNN as an operator which can change the formats of its tensors, and solve any format related issues in this module. - :param t_op: TFLite operator with inputs and outputs corresponding to the ONNX LSTM/RNN operator. + :param t_op: NeutronIR operator with inputs and outputs corresponding to the ExecuTorch LSTM/RNN operator. :param builder: ModelBuilder object. :param ops: OpsList object, with operators to add to the model. May already contain some operators. """ @@ -69,44 +62,3 @@ def ensure_correct_tensor_formatting( ops.post_ops.append(transpose) t_op.tmp_outputs[idx].tensor_format = TensorFormat.FORMATLESS - - -def get_activation_function_for_name( - name: str, op_type: str = "LSTM" -) -> ActivationFunctionType: - get_activation_function_for_name.map = { - "Tanh": ActivationFunctionType.TANH, - "Relu": ActivationFunctionType.RELU, - } - - if act_fun := get_activation_function_for_name.map.get(name, None): - return act_fun - - # Couldn't find a corresponding activation function - logger.e( - logger.Code.CONVERSION_IMPOSSIBLE, - f"Conversion of ONNX {op_type} with activation function '{name}' is not possible.", - ) - - -def check_sequence_lens( - t_op: tflite_model.Operator, seq_length: int, op_type: str = "LSTM" -): - """Check if the 'sequence_lens' operand of ONNX LSTM/RNN has an effect. If it does, exit with error. - - :param t_op: TFLite operator with inputs and outputs corresponding to the ONNX operator. - :param seq_length: The first dimension of the main LSTM input. - :param op_type: Operator type of 't_op'. Used only for printing a specific error message. - """ - if sequence_lens := try_get_input(t_op, 4): - # 'sequence_lens' allows each sequence to have a different length. 
As far as I can tell, TFLite doesn't support - # this. - if (not tensor_has_data(sequence_lens)) or any( - elt != seq_length for elt in sequence_lens.tmp_buffer.data - ): - # The 'sequence_lens' is either dynamic, or static with at least one value different from 'seq_length'. - # Conversion most likely impossible. - logger.e( - logger.Code.CONVERSION_IMPOSSIBLE, - f"Conversion of ONNX {op_type} with 'sequence_lens' input is not possible.", - ) diff --git a/backends/nxp/backend/ir/converter/node_converters/shared/reduce_utils.py b/backends/nxp/backend/ir/converter/node_converters/shared/reduce_utils.py index 1dca3acea74..da92e359f1e 100755 --- a/backends/nxp/backend/ir/converter/node_converters/shared/reduce_utils.py +++ b/backends/nxp/backend/ir/converter/node_converters/shared/reduce_utils.py @@ -4,6 +4,7 @@ # LICENSE file in the root directory of this source tree. import numpy as np + from executorch.backends.nxp.backend.ir.converter.builder.model_builder import ( ModelBuilder, ) @@ -16,7 +17,7 @@ def convert_axes_from_attribute( t_op: tflite_model.Operator, builder: ModelBuilder, axes: list[int] | None ): - """Create an `axes` tensor and assign it as an input to the `t_op`, which is expected to represent an ONNX + """Create an `axes` tensor and assign it as an input to the `t_op`, which is expected to represent an ExecuTorch reduction operator. 
""" x = t_op.tmp_inputs[0] @@ -52,15 +53,15 @@ def ensure_reduce_transposition(builder, ops: OpsList): output_format = output_tensor.tensor_format if input_format.is_channels_last() and output_format.is_channels_last(): - to_onnx_perm = translator.create_channels_last_to_channels_first_permutation( - input_rank + to_executorch_perm = ( + translator.create_channels_last_to_channels_first_permutation(input_rank) ) to_tflite_perm = translator.create_channels_first_to_channels_last_permutation( output_rank, return_list=True ) transpose_before = builder.create_transpose_operator_before( - t_op, 0, to_onnx_perm + t_op, 0, to_executorch_perm ) transpose_before.tmp_outputs[0].tensor_format = TensorFormat.CHANNELS_FIRST ops.add_pre(transpose_before) @@ -72,7 +73,7 @@ def ensure_reduce_transposition(builder, ops: OpsList): ops.post_ops.insert(0, transpose_after) elif input_format.is_channels_last() and not output_format.is_channels_last(): - # The dimensions of the tensor lose their meaning! Insert a transpose op, to change input to match ONNX. + # The dimensions of the tensor lose their meaning! Insert a transpose op, to change input to match ExecuTorch. permutation = list( translator.create_channels_last_to_channels_first_permutation(input_rank) @@ -83,9 +84,9 @@ def ensure_reduce_transposition(builder, ops: OpsList): ops.add_pre(transpose) elif not input_format.is_channels_last() and output_format.is_channels_last(): - # The ReduceX introduces format to the tensor - # The ONNX ReduceX outputs a 'channels first' tensor. This has to stay the same, and then a Transpose operator - # must be added, to change the tensor to 'channels last'. + # The reduction operator introduces format to the tensor. + # The ExecuTorch reduction operator outputs a 'channels first' tensor. This has to stay the same, and then a + # Transpose operator must be added, to change the tensor to 'channels last'. 
permutation = list( translator.create_channels_first_to_channels_last_permutation(output_rank) diff --git a/backends/nxp/backend/ir/converter/node_converters/shared/reshape_transposition.py b/backends/nxp/backend/ir/converter/node_converters/shared/reshape_transposition.py index 0e55c27684b..55056614684 100755 --- a/backends/nxp/backend/ir/converter/node_converters/shared/reshape_transposition.py +++ b/backends/nxp/backend/ir/converter/node_converters/shared/reshape_transposition.py @@ -1,4 +1,4 @@ -# Copyright 2023 NXP +# Copyright 2023-2025 NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -158,7 +158,7 @@ def ensure_reshape_transposition(builder, ops: OpsList) -> list[int]: new_shape = output_tensor.shape.vector if input_format.is_channels_last() and not output_format.is_channels_last(): - # The dimensions of the tensor lose their meaning! Insert a transpose op, to change input to match ONNX. + # The dimensions of the tensor lose their meaning! Insert a transpose op, to change input to match ExecuTorch. permutation = list( translator.create_channels_last_to_channels_first_permutation(input_rank) @@ -170,7 +170,7 @@ def ensure_reshape_transposition(builder, ops: OpsList) -> list[int]: elif not input_format.is_channels_last() and output_format.is_channels_last(): # The Reshape introduces format to the tensor (2D -> 4D for example) - # The ONNX Reshape outputs a 'channels first' tensor. This has to stay the same, and then a Transpose operator + # The `view_copy` outputs a 'channels first' tensor. This has to stay the same, and then a Transpose operator # must be added, to change the tensor to 'channels last'. 
permutation = list( diff --git a/backends/nxp/backend/ir/converter/quantization_utils.py b/backends/nxp/backend/ir/converter/quantization_utils.py index d9e7674d953..11de4eec13c 100755 --- a/backends/nxp/backend/ir/converter/quantization_utils.py +++ b/backends/nxp/backend/ir/converter/quantization_utils.py @@ -1,111 +1,19 @@ -# Copyright 2023 NXP +# Copyright 2023-2025 NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. import copy -from typing import Iterable, List, Optional - -import executorch.backends.nxp.backend.ir.converter.builder.model_builder as model_builder +from typing import List import numpy as np + from executorch.backends.nxp.backend.ir import logger as logger -from executorch.backends.nxp.backend.ir.converter.conversion.translator import ( - tf_lite_type_to_numpy, -) -from executorch.backends.nxp.backend.ir.lib.tflite import TensorType as tflTensorType -from executorch.backends.nxp.backend.ir.lib.tflite.TensorType import TensorType from executorch.backends.nxp.backend.ir.tflite_generator import ( tflite_model as tflite_model, ) -def quantization_is_equal( - x_scale: np.ndarray, - x_zp: np.ndarray, - x_type: TensorType, - y_scale: np.ndarray, - y_zp: np.ndarray, - y_type: TensorType, -) -> bool: - """Determine if provided quantization parameters of tensors 'x' and 'y' are the same. - - :param x_scale: Scale of the 'x' tensor. - :param x_zp: Zero point of the 'x' tensor. - :param x_type: TFLite data type of the 'x' tensor. - :param y_scale: Scale of the 'y' tensor. - :param y_zp: Zero point of the 'y' tensor. - :param y_type: TFLite data type of the 'y' tensor. - :return: True, if the quantization parameters are equal. 
- """ - if x_type != y_type: - return False - - if not (x_scale.size == x_zp.size == y_scale.size == y_zp.size): - return False - - x_scale, x_zp = quantization_params_to_lists(x_scale, x_zp) - y_scale, y_zp = quantization_params_to_lists(y_scale, y_zp) - - return all( - x_s == y_s and x_z == y_z - for x_s, y_s, x_z, y_z in zip(x_scale, y_scale, x_zp, y_zp) - ) - - -def quantization_params_to_lists( - scale: np.ndarray, zero_point: np.ndarray -) -> (List[float], List[int]): - if (scale is None) or (zero_point is None): - logger.e( - logger.Code.INTERNAL_ERROR, - "Missing zero_point and/or scale quantization params when converting to list!", - ) - - if (scale.size == 1) and (zero_point.size == 1): - # Per tensor quantization - scale = [scale.item()] - zero_point = [zero_point.item()] - elif (scale.size != 1) and (zero_point.size != 1): - # Per channel quantization - scale = scale.tolist() - zero_point = zero_point.tolist() - else: - logger.e( - logger.Code.CONVERSION_IMPOSSIBLE, - "TFLite doesn't support combination of per-channel and per-tensor quantization params.", - ) - - return scale, zero_point - - -def is_quantization_valid(scale, zero_point): - return scale.size == zero_point.size - - -def is_per_tensor_quantized(scale, zero_point): - return (scale.size == 1) and (zero_point.size == 1) - - -def is_per_channel_quantized(scale, zero_point): - return is_quantization_valid(scale, zero_point) and not is_per_tensor_quantized( - scale, zero_point - ) - - -def get_symmetric_zero_point_for_type(tensor_type: TensorType): - match tensor_type: - case TensorType.INT8: - return 0 - case TensorType.UINT8: - return 128 - case _: - logger.e( - logger.Code.INTERNAL_ERROR, - f"Attempt to get zero point definition for type: {tensor_type}", - ) - - def _validate_or_set_quant_params( tensor: tflite_model.Tensor, quant: tflite_model.Quantization ) -> bool: @@ -130,7 +38,7 @@ def propagate_quantization( """ Propagates quantization parameters from from_tensor to to_tensor. 
If to_tensor already has the params set checks the consistency. - :raises: logger.Error - INVALID_ONNX_MODEL + :raises: logger.Error - INVALID_INPUT_MODEL """ if ( @@ -147,7 +55,7 @@ def propagate_quantization( # noinspection PyTypeChecker if not _validate_or_set_quant_params(to_tensor, from_tensor.quantization): logger.e( - logger.Code.INVALID_ONNX_MODEL, + logger.Code.INVALID_INPUT_MODEL, f'Mismatched quantization parameters between tensors "{from_tensor.name}" and "{to_tensor.name}"', ) @@ -161,16 +69,16 @@ def set_quantization_parameters_to_tensor( """Create a TFLite QuantizationParameters object, initialize it from given parameters and add it to the 'tflite_tensor'. :param tflite_tensor: The TFLite tensor in the model, to add the quantization to. - :param scale: The data of the tensor, which is an input of a quantized ONNX operator and represents the + :param scale: The data of the tensor, which is an input of a quantized ExecuTorch operator and represents the quantization scale. - :param zero_point: The data of the tensor, which is an input of a quantized ONNX operator and represents the + :param zero_point: The data of the tensor, which is an input of a quantized ExecuTorch operator and represents the quantization zero point. :param quantized_dimension: The quantized dimension attribute of TFLite QuantizationParameters. """ if (scale is None) or (zero_point is None): logger.e( logger.Code.NOT_IMPLEMENTED, - "Conversion of ONNX quantized operators is only supported when " + "Conversion of ExecuTorch quantized operators is only supported when " "the quantization parameters are static!", ) @@ -184,8 +92,8 @@ def set_quantization_parameters_to_tensor( if scale.size != zero_point.size: logger.e( - logger.Code.INVALID_ONNX_MODEL, - f"The per channel quantization parameters of ONNX tensor " + logger.Code.INVALID_INPUT_MODEL, + f"The per channel quantization parameters of ExecuTorch tensor " f"'{tflite_tensor.name}' are of different sizes! 
('{scale.size}'" f" != '{zero_point.size}')", ) @@ -193,8 +101,8 @@ def set_quantization_parameters_to_tensor( quantized_dimension_size = tflite_tensor.shape.get(quantized_dimension) if scale.size != quantized_dimension_size: logger.e( - logger.Code.INVALID_ONNX_MODEL, - f"The ONNX per channel quantization parameter vectors do not " + logger.Code.INVALID_INPUT_MODEL, + f"The ExecuTorch per channel quantization parameter vectors do not " f"match the size of the quantized dimension! ('{scale.size}' != " f"'{quantized_dimension_size}')", ) @@ -205,8 +113,8 @@ def set_quantization_parameters_to_tensor( else: # Combination of per tensor and per channel quantization parameters logger.e( - logger.Code.INVALID_ONNX_MODEL, - f"ONNX tensor '{tflite_tensor.name}' uses a combination of per " + logger.Code.INVALID_INPUT_MODEL, + f"ExecuTorch node '{tflite_tensor.name}' uses a combination of per " f"tensor and per channel quantization parameters. Conversion to " f"TFLite is not possible!", ) @@ -218,33 +126,12 @@ def set_quantization_parameters_to_tensor( ) if not _validate_or_set_quant_params(tflite_tensor, quant): logger.e( - logger.Code.INVALID_ONNX_MODEL, + logger.Code.INVALID_INPUT_MODEL, f'Mismatched quantization parameters between tensors: "{tflite_tensor.name}" already ' f"has the quantization params set", ) -def calculate_uint_to_int_re_quantization_zero_point( - data_type_byte_size: int, old_zero_point: Iterable[int] -) -> np.ndarray: - """ - Calculate the new zero points, after a quantized tensor with an unsigned int data type is re-quantized to - a signed type. - :param data_type_byte_size: Size of the data type that is used, in Bytes. For example 1 for INT8. - :param old_zero_point: The zero point quantisation parameter, of the original data, before re-quantization. - :return: The new zero point quantisation parameter, after re-quantization. 
- """ - data_type_bit_size = 8 * data_type_byte_size - zero_point_shift = 2 ** (data_type_bit_size - 1) - return np.asarray(np.subtract(np.array(old_zero_point, np.int32), zero_point_shift)) - - -def _re_quantize_uint8_to_int8(tensor_data: np.ndarray) -> np.ndarray: - """Re-quantize static uint8 data to int8.""" - int16_data = np.asarray(tensor_data, np.int16) - return np.array(int16_data - 128, np.int8) - - def quantize_int8( data: np.ndarray, scale: List[float], zero_point: List[int] ) -> np.ndarray: @@ -252,20 +139,6 @@ def quantize_int8( return np.clip(new_data, -128, 127).astype(np.int8) -def quantize_uint8( - data: np.ndarray, scale: List[float], zero_point: List[int] -) -> np.ndarray: - new_data = np.add(np.round(np.divide(data, scale)), zero_point) - return np.clip(new_data, 0, 255).astype(np.uint8) - - -def quantize_int32( - data: np.ndarray, scale: List[float], zero_point: List[int] -) -> np.ndarray: - new_data = np.add(np.round(np.divide(data, scale)), zero_point) - return np.clip(new_data, -2_147_483_648, 2_147_483_648).astype(np.int32) - - def dequantize( data: np.ndarray, scale: List[float], zero_point: List[int] ) -> np.ndarray: @@ -274,211 +147,3 @@ def dequantize( scale, dtype=np.float32, ) - - -def re_quantize_static_tensor( - builder: "model_builder.ModelBuilder", - tflite_tensor: tflite_model.Tensor, - to_type: tflTensorType.TensorType, - new_scale: Optional[List[float]] = None, - new_zero_point: Optional[List[int]] = None, -) -> tflite_model.Tensor: - """Create a new TFLite Tensor with new quantization parameters, type and data. - - :param builder: A ModelBuilder instance. - :param tflite_tensor: TFLite tensor to re-quantize. - :param to_type: The TFLite TensorType, that the tensor will be re-quantized to. - :param new_scale: New scale quantization parameter. Used only when re-quantizing to the same type. - :param new_zero_point: New zero point quantization parameter. Used only when re-quantizing to the same type. 
- :return: A new re-quantized tensor. - """ - if tflite_tensor.quantization is None: - logger.e( - logger.Code.INTERNAL_ERROR, - "translator.re_quantize_static_tensor(): Got tensor without quantization!", - ) - - if tflite_tensor.tmp_buffer.data is None: - logger.e( - logger.Code.INTERNAL_ERROR, - "translator.re_quantize_static_tensor(): Got tensor without static data!", - ) - - new_dtype = tf_lite_type_to_numpy(to_type) - re_quantized_tensor = builder.duplicate_tensor(tflite_tensor) - tensor_data = re_quantized_tensor.tmp_buffer.data - - if tensor_data.dtype == np.uint8 and new_dtype == np.int8: # INT8 -> UINT8 - re_quantized_tensor.tmp_buffer.data = _re_quantize_uint8_to_int8(tensor_data) - re_quantized_tensor.type = tflTensorType.TensorType.INT8 - calculated_zero_point = calculate_uint_to_int_re_quantization_zero_point( - 1, re_quantized_tensor.quantization.zero_point.vector - ) - re_quantized_tensor.quantization.zero_point = tflite_model.ZeroPoint( - list(calculated_zero_point) - ) - - elif tensor_data.dtype == np.int32 and new_dtype == np.int8: # INT32 -> INT8 - if new_zero_point is None or new_scale is None: - logger.e( - logger.Code.INTERNAL_ERROR, - "Missing new zero_point or new scale when re-quantizing tensor.", - ) - - old_zp = re_quantized_tensor.quantization.zero_point.vector - old_scale = re_quantized_tensor.quantization.scale.vector - float_data = dequantize(tensor_data, old_scale, old_zp) - int8_data = quantize_int8(float_data, new_scale, new_zero_point) - - re_quantized_tensor.tmp_buffer.data = int8_data - re_quantized_tensor.type = tflTensorType.TensorType.INT8 - re_quantized_tensor.quantization.zero_point = tflite_model.ZeroPoint( - list(new_zero_point) - ) - re_quantized_tensor.quantization.scale = tflite_model.Scale(list(new_scale)) - - elif tensor_data.dtype == np.int32 and new_dtype == np.uint8: # INT32 -> UINT8 - if new_zero_point is None or new_scale is None: - logger.e( - logger.Code.INTERNAL_ERROR, - "Missing new zero_point or new scale 
when re-quantizing tensor.", - ) - - old_zp = re_quantized_tensor.quantization.zero_point.vector - old_scale = re_quantized_tensor.quantization.scale.vector - float_data = dequantize(tensor_data, old_scale, old_zp) - uint8_data = quantize_uint8(float_data, new_scale, new_zero_point) - - re_quantized_tensor.tmp_buffer.data = uint8_data - re_quantized_tensor.type = tflTensorType.TensorType.UINT8 - re_quantized_tensor.quantization.zero_point = tflite_model.ZeroPoint( - list(new_zero_point) - ) - re_quantized_tensor.quantization.scale = tflite_model.Scale(list(new_scale)) - - elif tensor_data.dtype == np.int8 and new_dtype == np.int8: # INT8 -> INT8 - # Re-quantizing int8 tensor data with different quantization parameters - if new_zero_point is None or new_scale is None: - logger.e( - logger.Code.INTERNAL_ERROR, - "Missing new zero_point or new scale when re-quantizing tensor.", - ) - - zero_point_data = re_quantized_tensor.quantization.zero_point.vector - scale_data = re_quantized_tensor.quantization.scale.vector - new_tensor_data = dequantize(tensor_data, scale_data, zero_point_data) - - re_quantized_tensor.tmp_buffer.data = quantize_int8( - new_tensor_data, new_scale, new_zero_point - ) - re_quantized_tensor.quantization.scale = tflite_model.Scale(new_scale) - re_quantized_tensor.quantization.zero_point = tflite_model.ZeroPoint( - new_zero_point - ) - - elif tensor_data.dtype == np.int32 and new_dtype == np.int32: # INT32 -> INT32 - if new_zero_point is None or new_scale is None: - logger.e( - logger.Code.INTERNAL_ERROR, - "Missing new zero_point or new scale when re-quantizing tensor.", - ) - - old_zp = re_quantized_tensor.quantization.zero_point.vector - old_scale = re_quantized_tensor.quantization.scale.vector - float_data = dequantize(tensor_data, old_scale, old_zp) - int32_data = quantize_int32(float_data, new_scale, new_zero_point) - - re_quantized_tensor.tmp_buffer.data = int32_data - re_quantized_tensor.quantization.zero_point = tflite_model.ZeroPoint( - 
list(new_zero_point) - ) - re_quantized_tensor.quantization.scale = tflite_model.Scale(list(new_scale)) - - else: - logger.e( - logger.Code.NOT_IMPLEMENTED, - f"Re-quantization of static tensors from type '{tensor_data.dtype}' " - f"to type '{to_type}' is not yet implemented!", - ) - - return re_quantized_tensor - - -def quantize_static_float_tensor( - builder: "model_builder.ModelBuilder", - tflite_tensor: tflite_model.Tensor, - to_type: tflTensorType.TensorType, - scale: List[float], - zero_point: List[int], - quantized_dimension: int = 0, -) -> tflite_model.Tensor: - """Quantize tensor 'tflite_tensor' with passed quantization params. - - :param builder: A ModelBuilder instance. - :param tflite_tensor: TFLite tensor to quantize. - :param to_type: The TFLite TensorType, that the tensor will be quantized to. - :param scale: Scale quantization parameter. - :param zero_point: Zero point quantization parameter. - :param quantized_dimension: Quantized dimension. - """ - if tflite_tensor.quantization is not None: - logger.e(logger.Code.INTERNAL_ERROR, "Got tensor with quantization!") - - if tflite_tensor.tmp_buffer.data is None: - logger.e(logger.Code.INTERNAL_ERROR, "Got tensor without static data!") - - quantized_tensor = builder.duplicate_tensor(tflite_tensor) - tensor_data = quantized_tensor.tmp_buffer.data - - if zero_point is None or scale is None: - logger.e( - logger.Code.INTERNAL_ERROR, - "Missing new zero_point or new scale when quantizing tensor.", - ) - - new_dtype = tf_lite_type_to_numpy(to_type) - - if tensor_data.dtype == np.float32 and new_dtype == np.int8: - int8_data = quantize_int8(tensor_data, scale, zero_point) - - quantized_tensor.tmp_buffer.data = int8_data - quantized_tensor.type = tflTensorType.TensorType.INT8 - quantized_tensor.quantization = tflite_model.Quantization() - quantized_tensor.quantization.zero_point = tflite_model.ZeroPoint( - list(zero_point) - ) - quantized_tensor.quantization.scale = tflite_model.Scale(list(scale)) - 
quantized_tensor.quantization.quantized_dimension = quantized_dimension - - elif tensor_data.dtype == np.float32 and new_dtype == np.uint8: - uint8_data = quantize_uint8(tensor_data, scale, zero_point) - - quantized_tensor.tmp_buffer.data = uint8_data - quantized_tensor.type = tflTensorType.TensorType.UINT8 - quantized_tensor.quantization = tflite_model.Quantization() - quantized_tensor.quantization.zero_point = tflite_model.ZeroPoint( - list(zero_point) - ) - quantized_tensor.quantization.scale = tflite_model.Scale(list(scale)) - quantized_tensor.quantization.quantized_dimension = quantized_dimension - - elif tensor_data.dtype == np.float32 and new_dtype == np.int32: - int32_data = quantize_int32(tensor_data, scale, zero_point) - - quantized_tensor.tmp_buffer.data = int32_data - quantized_tensor.type = tflTensorType.TensorType.INT32 - quantized_tensor.quantization = tflite_model.Quantization() - quantized_tensor.quantization.zero_point = tflite_model.ZeroPoint( - list(zero_point) - ) - quantized_tensor.quantization.scale = tflite_model.Scale(list(scale)) - quantized_tensor.quantization.quantized_dimension = quantized_dimension - - else: - logger.e( - logger.Code.NOT_IMPLEMENTED, - f"Quantization of static tensors from type '{tensor_data.dtype}' " - f"to type '{to_type}' is not yet implemented!", - ) - - return quantized_tensor diff --git a/backends/nxp/backend/ir/logger.py b/backends/nxp/backend/ir/logger.py index ce8da2a31df..8019fb4d780 100644 --- a/backends/nxp/backend/ir/logger.py +++ b/backends/nxp/backend/ir/logger.py @@ -1,6 +1,6 @@ # # Copyright 2023 Martin Pavella -# Copyright 2023 NXP +# Copyright 2023-2025 NXP # # License: MIT # See the LICENSE_MIT for more details. @@ -85,18 +85,18 @@ class Code(Enum): PREPROCESSING_ERROR = 4 UNSUPPORTED_OPERATOR = 21 - UNSUPPORTED_ONNX_TYPE = 22 + # Code 22 was removed. 
UNSUPPORTED_OPERATOR_ATTRIBUTES = 23 NOT_IMPLEMENTED = 24 INVALID_TYPE = 31 INVALID_TENSOR_SHAPE = 32 - INVALID_ONNX_OPERATOR = 33 - INVALID_ONNX_OPERATOR_ATTRIBUTE = 34 - INVALID_ONNX_MODEL = 35 + # Code 33 was removed. + INVALID_OPERATOR_ATTRIBUTE = 34 + INVALID_INPUT_MODEL = 35 CONVERSION_IMPOSSIBLE = 41 - SHAPE_INFERENCE_ERROR = 42 + # Code 42 was removed. IO_PRESERVATION_ERROR = 43 INVALID_INPUT = 51 @@ -142,8 +142,6 @@ class BasicLoggingContext(LoggingContext): """ GLOBAL = LoggingContext("global") - SHAPE_INFERENCE = LoggingContext("shape_inference") - ONNX_PARSER = LoggingContext("onnx_parser") OPERATOR_CONVERSION = LoggingContext("operator_conversion") TFLITE_GENERATOR = LoggingContext("tflite_generator") QDQ_QUANTIZER = LoggingContext("qdq_quantizer") @@ -151,7 +149,7 @@ class BasicLoggingContext(LoggingContext): class NodeLoggingContext(LoggingContext): """ - ONNX node specific context. Logs reported within this context are related to node with index 'node_id'. + ExecuTorch node specific context. Logs reported within this context are related to node with index 'node_id'. """ def __init__(self, node_id): @@ -213,7 +211,7 @@ def _get_node_error(self, node_id: int, dict_item: str) -> Code | str | None: Return first error log item that belong to node with id 'node_id'. If no error is present None is returned instead. - :param node_id: ONNX node id. + :param node_id: ExecuTorch node id. :param dict_item: Dictionary item to return from `log` :return: Error code or None if there's no error related to node. """ @@ -230,7 +228,7 @@ def get_node_error_code(self, node_id: int) -> Code | None: Return first error code that belong to node with id 'node_id'. If no error is present None is returned instead. - :param node_id: ONNX node id. + :param node_id: ExecuTorch node id. :return: Error code or None if there's no error related to node. 
""" @@ -241,7 +239,7 @@ def get_node_error_message(self, node_id: int) -> str | None: Return first error message that belong to node with id 'node_id'. If no error is present None is returned instead. - :param node_id: ONNX node id + :param node_id: ExecuTorch node id :return: Error message or None if there is no error related to node. """ @@ -256,7 +254,7 @@ class loggingContext: Context manager used to nest logging contexts. Usage: with loggingContext(BasicLoggingContext.GLOBAL): - with loggingContext(BasicLoggingContext.ONNX_PARSER): + with loggingContext(BasicLoggingContext.OPERATOR_CONVERSION): logger.i("My log") # this log is automatically assigned to both parent contexts """ diff --git a/backends/nxp/backend/ir/tensor_formatting.py b/backends/nxp/backend/ir/tensor_formatting.py index aab22c3c368..db24576e81f 100644 --- a/backends/nxp/backend/ir/tensor_formatting.py +++ b/backends/nxp/backend/ir/tensor_formatting.py @@ -1,6 +1,5 @@ -# # Copyright 2023 Martin Pavella -# Copyright 2023-2024 NXP +# Copyright 2023-2025 NXP # # License: MIT # See the LICENSE_MIT for more details. @@ -26,7 +25,7 @@ class TensorFormat(Enum): TRANSPOSE_CONV_2D_WEIGHT_FORMAT = 13 # No special format (matrices, vectors, shapes etc.). All tensors with the FORMATLESS format MUST have EXACTLY - # the same shape and data in the TFLite model and in the ONNX model. + # the same shape and data in the NeutronIR model and in the ExecuTorch model. FORMATLESS = 20 NONE = 30 # Format has not been identified diff --git a/backends/nxp/backend/ir/tflite_generator/tflite_model.py b/backends/nxp/backend/ir/tflite_generator/tflite_model.py index a9384861178..76a50a2e177 100755 --- a/backends/nxp/backend/ir/tflite_generator/tflite_model.py +++ b/backends/nxp/backend/ir/tflite_generator/tflite_model.py @@ -1,6 +1,5 @@ -# # Copyright 2023 Martin Pavella -# Copyright 2023-2024 NXP +# Copyright 2023-2025 NXP # # License: MIT # See the LICENSE_MIT for more details. 
@@ -272,8 +271,7 @@ def is_per_tensor(self) -> bool: return False def gen_tflite(self, builder: fb.Builder): - # Sometimes 1D per-tensor quantized tensors can have quantized_dimension != 0 - # (residue from badly defined ONNX models). This would cause TFLite inference to crash. + # Sometimes 1D per-tensor quantized tensors can have quantized_dimension != 0. if not self.is_per_channel(): self.quantized_dimension = 0 @@ -513,7 +511,7 @@ class Operator(meta.TFLiteObject): tmp_outputs: List[Tensor] tmp_version: int # OperatorConverter uses this to assign the corresponding operator code with correct version. - # If `True`, this is an extra operator added during conversion. It was not present in the original ONNX model. + # If `True`, this is an extra operator added during conversion. It was not present in the original input model. tmp_added_extra: bool def __init__( diff --git a/backends/nxp/backend/ir/tflite_optimizer/operator_rules.py b/backends/nxp/backend/ir/tflite_optimizer/operator_rules.py index 253dc9c69a1..e861eff0d18 100755 --- a/backends/nxp/backend/ir/tflite_optimizer/operator_rules.py +++ b/backends/nxp/backend/ir/tflite_optimizer/operator_rules.py @@ -1,4 +1,4 @@ -# Copyright 2024 NXP +# Copyright 2024-2025 NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -100,23 +100,3 @@ def __call__( operator_is_type(preceding_op, self.single_preceding_op_type, builder) for preceding_op in preceding_ops ) - - -@dataclass -class WasNotInTheOriginalONNXModel(OpRule): - """Assures that this operator wasn't created by converting an ONNX operator from the original model, but instead - was added extra in order to convert a different operator. - - This rule is currently only satisfied for operators added by ModelBuilder methods `create_..._before()` and - `create_..._after()`. 
- """ - - def __call__( - self, - op: tflite_model.Operator, - tensor_map: NameToTensorMap, - input_to_ops_map: InputTensorToOpsMap, - output_to_op_map: OutputTensorToOpMap, - builder: "model_builder.ModelBuilder", - ) -> bool: - return op.tmp_added_extra diff --git a/backends/nxp/backend/ir/tflite_optimizer/optimizations/combine_hard_sigmoid_and_mul_to_hard_swish.py b/backends/nxp/backend/ir/tflite_optimizer/optimizations/combine_hard_sigmoid_and_mul_to_hard_swish.py deleted file mode 100755 index dddabfe87f1..00000000000 --- a/backends/nxp/backend/ir/tflite_optimizer/optimizations/combine_hard_sigmoid_and_mul_to_hard_swish.py +++ /dev/null @@ -1,256 +0,0 @@ -# Copyright 2024 NXP -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from executorch.backends.nxp.backend.ir.lib.tflite.BuiltinOperator import ( - BuiltinOperator, -) -from executorch.backends.nxp.backend.ir.lib.tflite.TensorType import TensorType -from executorch.backends.nxp.backend.ir.tflite_generator import tflite_model -from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options.hard_swish_options import ( - HardSwish, -) -from executorch.backends.nxp.backend.ir.tflite_optimizer.optimizations.base_optimization import ( - BaseOptimization, -) -from executorch.backends.nxp.backend.ir.tflite_optimizer.pattern_matcher import ( - OneOf, - Op, - PatternMatcher, -) -from executorch.backends.nxp.backend.ir.tflite_optimizer.tensor_rules import ( - RuleOr, - TensorHasNConsumers, - TensorHasStaticValue, - TensorHasType, - TensorsAreQuantized, - TensorsHaveOneConsumer, - TensorsHaveType, -) - - -class CombineHardSigmoidAndMulIntoHardSwish(BaseOptimization): - - def __call__(self) -> bool: - made_changes = self._combine_float_variant() - made_changes |= self._combine_quantized_variant() - - return made_changes - - def _combine_float_variant(self) -> bool: - """Fuse some operators in the following pattern. 
The ops `Mul`, `Add` `Minimum` and `Relu` compute the - `HardSigmoid` operation, as there is no `HardSigmoid` operator in TFLite. - - ┌─────┴─────┐ `x` - ┌──▼──┐ │ - 1/6 ──► Mul │ │ - └──┬──┘ │ - ┌──▼──┐ │ - 1/2 ──► Add │ │ │ - └──┬──┘ │ ┌─────▼─────┐ - ┌────▼────┐ │ ─────► │ HardSwish │ - 1 ──► Minimum │ │ └─────┬─────┘ - └────┬────┘ │ - ┌──▼───┐ │ - │ Relu │ │ - └──┬───┘ │ - └───┐ ┌───┘ - ┌▼───▼┐ - │ Mul │ - └──┬──┘ - """ - - matcher = PatternMatcher( - self._builder, - [ - Op(["Mul"], ["x", "alpha"], ["mul_o"]), - OneOf( - [ - Op(["Add"], ["mul_o", "beta"], ["add_o"]), - Op(["Add"], ["beta", "mul_o"], ["add_o"]), - ] - ), - OneOf( - [ - Op(["Minimum"], ["add_o", "one"], ["min_o"]), - Op(["Minimum"], ["one", "add_o"], ["min_o"]), - ] - ), - Op(["Relu"], ["min_o"], ["relu_o"]), - OneOf( - [ - Op(["Mul"], ["x", "relu_o"], ["y"]), - Op(["Mul"], ["relu_o", "x"], ["y"]), - ] - ), - ], - [ - TensorHasNConsumers("x", 2), - TensorsHaveOneConsumer(["mul_o", "add_o", "min_o", "relu_o"]), - TensorHasStaticValue("alpha", 1 / 6), - TensorHasStaticValue("beta", 0.5), - TensorHasStaticValue("one", 1), - # `HardSwishConverter` and `HardSigmoidConverter` both only support float32. - TensorHasType("x", TensorType.FLOAT32), - ], - ) - - # The mapped operator (value) will be inserted into the model later, at the position of the `key` operator. 
- to_add: dict[tflite_model.Operator, tflite_model.Operator] = {} - to_remove = [] - for pattern_ops, tensor_map, _, _ in matcher.match_patterns(): - x, y = tensor_map["x"], tensor_map["y"] - hard_swish = tflite_model.Operator( - builtin_options=HardSwish(), - opcode_index=self._builder.op_code_index_for_op_type( - BuiltinOperator.HARD_SWISH - ), - ) - hard_swish.tmp_inputs = [x] - hard_swish.tmp_outputs = [y] - - to_add[pattern_ops[0]] = hard_swish - - to_remove.extend(pattern_ops) - - ops = self._builder.get_operators() - for k, v in to_add.items(): - idx = ops.index(k) - ops.insert(idx, v) - - for op in to_remove: - ops.remove(op) - - return len(to_remove) != 0 - - def _combine_quantized_variant(self) -> bool: - """Fuse some operators in the following pattern. The ops `Mul`, `Add` `Minimum` and `Relu` compute the - `HardSigmoid` operation, as there is no `HardSigmoid` operator in TFLite. - - The following pattern arises from using the `onnx2quant` on a model with `HardSwish`. The quantizer always - runs a pre-processing step which splits the ONNX `HardSwish` into `HardSigmoid` and `Mul`. It seems like it - cannot be turned off. Therefore, we cannot add QDQ quantization of `HardSwish`. But since `HardSigmoid` - gets converted to multiple TFLite operators, we also cannot really add QDQ quantization for that operator. - This means that `HardSwish` will never get fully quantized by the `onnx2quant`, and the following pattern - will be created. - We can, however, convert the entire pattern into a quantized `HardSwish` using this optimization. 
- - │ (u)int8 `x` - ┌─────▼──────┐ - │ Dequantize │ - └─────┬──────┘ - ┌─────┴─────┐ float32 - ┌──▼──┐ │ - 1/6 ──► Mul │ │ - └──┬──┘ │ - ┌──▼──┐ │ - 1/2 ──► Add │ │ - └──┬──┘ │ - ┌────▼────┐ │ - 1 ──► Minimum │ │ │ (u)int8 `x` - └────┬────┘ │ ┌─────▼─────┐ - ┌──▼───┐ │ ─────► │ HardSwish │ - │ Relu │ │ └─────┬─────┘ - └──┬───┘ │ │ (u)int8 `y` - ┌────▼─────┐ │ - │ Quantize │ │ - └────┬─────┘ │ - ┌─────▼──────┐ │ - │ Dequantize │ │ - └─────┬──────┘ │ - └───┐ ┌───┘ - ┌▼───▼┐ - │ Mul │ - └──┬──┘ - │ float32 - ┌────▼─────┐ - │ Quantize │ - └────┬─────┘ - │ (u)int8 `y` - """ - matcher = PatternMatcher( - self._builder, - [ - Op(["Dequantize"], ["x"], ["deq1_o"]), - OneOf( - [ - Op(["Mul"], ["deq1_o", "alpha"], ["mul1_o"]), - Op(["Mul"], ["alpha", "deq1_o"], ["mul1_o"]), - ] - ), - OneOf( - [ - Op(["Add"], ["mul1_o", "beta"], ["add_o"]), - Op(["Add"], ["beta", "mul1_o"], ["add_o"]), - ] - ), - OneOf( - [ - Op(["Minimum"], ["add_o", "one"], ["min_o"]), - Op(["Minimum"], ["one", "add_o"], ["min_o"]), - ] - ), - Op(["Relu"], ["min_o"], ["relu_o"]), - Op(["Quantize"], ["relu_o"], ["quant1_o"]), - Op(["Dequantize"], ["quant1_o"], ["deq2_o"]), - OneOf( - [ - Op(["Mul"], ["deq1_o", "deq2_o"], ["mul2_o"]), - Op(["Mul"], ["deq2_o", "deq1_o"], ["mul2_o"]), - ] - ), - Op(["Quantize"], ["mul2_o"], ["y"]), - ], - [ - TensorHasNConsumers("deq1_o", 2), - TensorsHaveOneConsumer( - [ - "mul1_o", - "add_o", - "min_o", - "relu_o", - "quant1_o", - "deq2_o", - "mul2_o", - ] - ), - TensorHasStaticValue("alpha", 1 / 6), - TensorHasStaticValue("beta", 0.5), - TensorHasStaticValue("one", 1), - TensorHasType("deq1_o", TensorType.FLOAT32), - TensorsAreQuantized(["x", "y"]), - RuleOr( - TensorsHaveType(["x", "y"], TensorType.INT8), - TensorsHaveType(["x", "y"], TensorType.UINT8), - ), - ], - ) - - # The mapped operator (value) will be inserted into the model later, at the position of the `key` operator. 
- to_add: dict[tflite_model.Operator, tflite_model.Operator] = {} - to_remove = [] - for pattern_ops, tensor_map, _, _ in matcher.match_patterns(): - x, y = tensor_map["x"], tensor_map["y"] - hard_swish = tflite_model.Operator( - builtin_options=HardSwish(), - opcode_index=self._builder.op_code_index_for_op_type( - BuiltinOperator.HARD_SWISH - ), - ) - hard_swish.tmp_inputs = [x] - hard_swish.tmp_outputs = [y] - - to_add[pattern_ops[0]] = hard_swish - - to_remove.extend(pattern_ops) - - ops = self._builder.get_operators() - for k, v in to_add.items(): - idx = ops.index(k) - ops.insert(idx, v) - - for op in to_remove: - ops.remove(op) - - return len(to_remove) != 0 diff --git a/backends/nxp/backend/ir/tflite_optimizer/optimizations/fuse_fully_connected_and_add_operators.py b/backends/nxp/backend/ir/tflite_optimizer/optimizations/fuse_fully_connected_and_add_operators.py deleted file mode 100755 index b6fd5849551..00000000000 --- a/backends/nxp/backend/ir/tflite_optimizer/optimizations/fuse_fully_connected_and_add_operators.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright 2024 NXP -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -from executorch.backends.nxp.backend.ir.lib.tflite.TensorType import TensorType -from executorch.backends.nxp.backend.ir.tflite_optimizer.operator_rules import ( - NoFusedActivationFunction, -) -from executorch.backends.nxp.backend.ir.tflite_optimizer.optimizations.base_optimization import ( - BaseOptimization, -) -from executorch.backends.nxp.backend.ir.tflite_optimizer.pattern_matcher import ( - OneOf, - Op, - PatternMatcher, -) -from executorch.backends.nxp.backend.ir.tflite_optimizer.tensor_rules import ( - RuleAnd, - RuleIf, - RuleOr, - TensorDimensionsMatch, - TensorHasDimensionOfSize, - TensorHasOneConsumer, - TensorHasRank, - TensorHasType, - TensorIsQuantized, -) - - -class FuseFullyConnectedAndAddOperators(BaseOptimization): - - def __call__(self) -> bool: - """ - FullyConnected -> Add sequence can handle more complicated shapes than just FullyConnected with bias - (due to shape broadcasting). - The bias can have shape [N] or [1, N], where N is the first dimension of the FC weights tensor. - It could also have shape [1, ..., 1, N], but then the TFLite FullyConnected removes the leading ones, - even if 'keep_num_dims' is True. In ONNX, the output tensor has the leading ones, - In this case, a Reshape would have to be added, so we do not perform the fusion. - - # https://github.com/tensorflow/tensorflow/blob/v2.15.0/tensorflow/lite/kernels/fully_connected.cc#L398 - """ - matcher = PatternMatcher( - self._builder, - [ - # Require exactly 2 inputs. 
- Op( - ["FullyConnected"], ["x", "w"], ["y"], [NoFusedActivationFunction()] - ), - OneOf([Op(["Add"], ["y", "b"]), Op(["Add"], ["b", "y"])]), - ], - [ - TensorHasOneConsumer("y"), - TensorHasRank("w", 2), - RuleOr( - TensorHasRank("b", 1), - RuleAnd(TensorHasRank("b", 2), TensorHasDimensionOfSize("b", 0, 1)), - ), - TensorDimensionsMatch("w", 0, "b", -1), - RuleIf(TensorIsQuantized("x"), TensorHasType("b", TensorType.INT32)), - ], - ) - - to_remove = [] - for (fc, add), tensor_map, _, _ in matcher.match_patterns(): - b = tensor_map["b"] - fc.tmp_inputs.append(b) - - # Remove the 'Add' operator. - fc.tmp_outputs[0] = add.tmp_outputs[0] - fc.builtin_options.fused_activation_function = ( - add.builtin_options.fused_activation_function - ) - to_remove.append(add) - - for op in to_remove: - self._builder.get_operators().remove(op) - - return len(to_remove) != 0 diff --git a/backends/nxp/backend/ir/tflite_optimizer/optimizations/permute_fully_connected_weights_after_reshape.py b/backends/nxp/backend/ir/tflite_optimizer/optimizations/permute_fully_connected_weights_after_reshape.py index 42eefc1ab56..ef76fad90de 100755 --- a/backends/nxp/backend/ir/tflite_optimizer/optimizations/permute_fully_connected_weights_after_reshape.py +++ b/backends/nxp/backend/ir/tflite_optimizer/optimizations/permute_fully_connected_weights_after_reshape.py @@ -1,4 +1,4 @@ -# Copyright 2024 NXP +# Copyright 2024-2025 NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -50,7 +50,7 @@ def __call__(self) -> bool: How it works: - The original model doesn't have the `Transpose`. It just has `Reshape` into `MatMul` (or `Gemm`...). - The `Transpose` is added, because the `Reshape` has a channels last input, which was originally - channels first (in the ONNX model), and so the 2D output of the `Reshape` would have the same data. 
+ channels first (in the ExecuTorch model), and so the 2D output of the `Reshape` would have the same data. but at different locations. The `Transpose` makes the input channels first, which ensures correct output of the `Reshape`. - In the scenario in the graph above, it is possible to omit the `Transpose`, which causes the `Reshape` @@ -85,12 +85,12 @@ def __call__(self) -> bool: for (transpose, reshape, fc), tensor_map, _, _ in matcher.match_patterns(): # Make sure the `Transpose` is applying the expected permutation. y = tensor_map["y"] - to_onnx_perm = ( + to_executorch_perm = ( translator.create_channels_last_to_channels_first_permutation( y.shape.len() ) ) - if not np.allclose(to_onnx_perm, tensor_map["perm"].tmp_buffer.data): + if not np.allclose(to_executorch_perm, tensor_map["perm"].tmp_buffer.data): continue # The `Transpose` has an unexpected permutation. w = tensor_map["w"] diff --git a/backends/nxp/backend/ir/tflite_optimizer/optimizations/prune_transpose_operators.py b/backends/nxp/backend/ir/tflite_optimizer/optimizations/prune_transpose_operators.py index dc9ad9999b4..0be46efcaa8 100755 --- a/backends/nxp/backend/ir/tflite_optimizer/optimizations/prune_transpose_operators.py +++ b/backends/nxp/backend/ir/tflite_optimizer/optimizations/prune_transpose_operators.py @@ -1,4 +1,4 @@ -# Copyright 2024 NXP +# Copyright 2024-2025 NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
diff --git a/backends/nxp/backend/ir/tflite_optimizer/optimizer.py b/backends/nxp/backend/ir/tflite_optimizer/optimizer.py index d4a097ca76d..69b75b72cdd 100755 --- a/backends/nxp/backend/ir/tflite_optimizer/optimizer.py +++ b/backends/nxp/backend/ir/tflite_optimizer/optimizer.py @@ -11,15 +11,9 @@ from executorch.backends.nxp.backend.ir import logger from executorch.backends.nxp.backend.ir.conversion_config import ConversionConfig -from executorch.backends.nxp.backend.ir.tflite_optimizer.optimizations.combine_hard_sigmoid_and_mul_to_hard_swish import ( - CombineHardSigmoidAndMulIntoHardSwish, -) from executorch.backends.nxp.backend.ir.tflite_optimizer.optimizations.fuse_activation_functions import ( FuseActivationFunctions, ) -from executorch.backends.nxp.backend.ir.tflite_optimizer.optimizations.fuse_fully_connected_and_add_operators import ( - FuseFullyConnectedAndAddOperators, -) from executorch.backends.nxp.backend.ir.tflite_optimizer.optimizations.move_relu_before_concat import ( MoveActivationBeforeConcatenation, ) @@ -34,7 +28,6 @@ class Optimization(Enum): FUSE_ACTIVATION_FUNCTIONS = 1 - FUSE_FULLY_CONNECTED_AND_ADD = 2 FUSE_TRANSPOSE_OPERATORS = 5 REMOVE_IDENTITY_TRANSPOSE_OPERATORS = 6 @@ -42,7 +35,6 @@ class Optimization(Enum): PERMUTE_FULLY_CONNECTED_WEIGHTS_AFTER_RESHAPE = 12 MOVE_ACTIVATION_BEFORE_CONCAT = 15 - COMBINE_HARD_SIGMOID_AND_MUL_INTO_HARD_SWISH = 16 class Optimizer: @@ -75,9 +67,6 @@ def __init__( Optimization.FUSE_ACTIVATION_FUNCTIONS: FuseActivationFunctions( builder, conversion_config ), - Optimization.FUSE_FULLY_CONNECTED_AND_ADD: FuseFullyConnectedAndAddOperators( - builder, conversion_config - ), Optimization.FUSE_TRANSPOSE_OPERATORS: FuseTransposeOperators( builder, conversion_config ), @@ -90,9 +79,6 @@ def __init__( Optimization.MOVE_ACTIVATION_BEFORE_CONCAT: MoveActivationBeforeConcatenation( builder, conversion_config ), - Optimization.COMBINE_HARD_SIGMOID_AND_MUL_INTO_HARD_SWISH: CombineHardSigmoidAndMulIntoHardSwish( - 
builder, conversion_config - ), } def optimize( diff --git a/backends/nxp/backend/neutron_converter_manager.py b/backends/nxp/backend/neutron_converter_manager.py index 2bc4380f89b..a6884a9ee24 100644 --- a/backends/nxp/backend/neutron_converter_manager.py +++ b/backends/nxp/backend/neutron_converter_manager.py @@ -7,8 +7,6 @@ import multiprocessing import pkgutil -from executorch.backends.nxp.backend.ir.converter.node_converter import Target - def convert_unsafe(neutron_converter, tflite_model, cctx, queue): """ @@ -27,16 +25,7 @@ class NeutronConverterManager: contains NeutronGraph nodes. """ - _supported_target_names = [Target.RT700.value] - - def convert( - self, tflite_model: bytes, target: str, neutron_converter_flavor: str - ) -> bytes: - # Neutron converter crashes if we provide invalid target -> verify. - if target not in self._supported_target_names: - raise RuntimeError( - f"Target '{target}' is not supported by NeutronConverterManager." - ) + def __init__(self, neutron_converter_flavor: str = "SDK_25_09"): neutron_converter_modules = [ module.name @@ -57,13 +46,34 @@ def convert( f"not found. Install 'neutron_converter_[flavor]' Python package." ) - neutron_converter = importlib.import_module( + self.neutron_converter = importlib.import_module( f"{requested_module_name}.neutron_converter" ) + self.neutron_library_utils = importlib.import_module( + f"{requested_module_name}.neutron_library_utils" + ) + + def get_converter(self): + return self.neutron_converter + + def get_library_utils(self): + return self.neutron_library_utils + + def verify_target(self, target: str): + if not self.neutron_library_utils.isNeutronTarget(target): + valid_targets = [ + target.name for target in self.neutron_library_utils.getNeutronTargets() + ] + raise ValueError( + f"Target `{target}` is not a valid target. Must be one of `{valid_targets}`." 
+ ) + + def convert(self, tflite_model: bytes, target: str) -> bytes: + # Neutron converter crashes if we provide invalid target -> verify. + self.verify_target(target) - cctx = neutron_converter.CompilationContext() - cctx.targetOpts = neutron_converter.getNeutronTarget(target) - # New switch since Neutron Converter SDK_25.06 + cctx = self.neutron_converter.CompilationContext() + cctx.targetOpts = self.neutron_converter.getNeutronTarget(target) cctx.compilationOpts.minNumOpsPerGraph = 1 logger = multiprocessing.log_to_stderr() @@ -71,7 +81,8 @@ def convert( queue = multiprocessing.Manager().Queue() process = multiprocessing.Process( - target=convert_unsafe, args=(neutron_converter, tflite_model, cctx, queue) + target=convert_unsafe, + args=(self.neutron_converter, tflite_model, cctx, queue), ) process.start() process.join() # waits until the subprocess is complete diff --git a/backends/nxp/backend/neutron_target_spec.py b/backends/nxp/backend/neutron_target_spec.py new file mode 100644 index 00000000000..44399982e29 --- /dev/null +++ b/backends/nxp/backend/neutron_target_spec.py @@ -0,0 +1,64 @@ +# Copyright 2025 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# +# Target Spec for the NXP Neutron NPU + +from enum import Enum + +from executorch.backends.nxp.backend.neutron_converter_manager import ( + NeutronConverterManager, +) + + +class NeutronHWVersion(Enum): + N1 = 1 + N3 = 2 + + +class NeutronTargetSpec: + """ + The functionality for probing the properties of Neutron Target. 
+ """ + + def __init__(self, target: str, neutron_converter_flavor: str): + + converter_manager = NeutronConverterManager(neutron_converter_flavor) + converter_manager.verify_target(target) + neutron_converter = converter_manager.get_converter() + self.neutron_target = neutron_converter.getNeutronTarget(target) + + if self.is_subsystem(): + raise ValueError( + f"Target `{target}` is not a neutron-C target. Only MCU targets are supported at the moment." + ) + + if self.get_hw_version() != NeutronHWVersion.N3: + raise ValueError( + f"Target `{target}` contains unsupported HW version. Only N3/N3+ targets are supported at the moment." + ) + + # Target name. + def get_name(self) -> str: + return self.neutron_target.name + + # Whether the target has subsystem (Neutron-S) or not (Neutron-C). + def is_subsystem(self) -> bool: + return self.neutron_target.subsystem + + # Number of compute units. + def get_num_units(self) -> int: + return self.neutron_target.numUnits + + # Number of compute pipelines. + def get_num_pipes(self) -> int: + return self.neutron_target.numPipes + + # Number of compute MACs. + def get_num_macs(self) -> int: + return self.neutron_target.numMacs + + # Neutron compute block hardware version. 
+ def get_hw_version(self) -> NeutronHWVersion: + return NeutronHWVersion(self.neutron_target.hwVersion) diff --git a/backends/nxp/edge_passes/neutron_edge_pass_manager.py b/backends/nxp/edge_passes/neutron_edge_pass_manager.py index ec46070ac31..5ce23138720 100644 --- a/backends/nxp/edge_passes/neutron_edge_pass_manager.py +++ b/backends/nxp/edge_passes/neutron_edge_pass_manager.py @@ -10,6 +10,10 @@ MoveTrailingAuxiliaryOperatorIntoSeparateQDQClusterPass, ) from executorch.backends.nxp.edge_passes.neutron_edge_pass import NeutronEdgePass + +from executorch.backends.nxp.edge_passes.remove_io_quant_ops_pass import ( + RemoveIOQuantOpsPass, +) from executorch.exir import EdgeProgramManager from executorch.exir.program._program import ( _get_updated_graph_signature, @@ -24,7 +28,9 @@ class NeutronEdgePassManager(PassManager): - def __init__(self, passes: list[NeutronEdgePass] = None): + def __init__( + self, passes: list[NeutronEdgePass] = None, remove_io_quant_ops: bool = False + ): passes: list[NeutronEdgePass] = passes or [ MoveLeadingAuxiliaryOperatorIntoSeparateQDQClusterPass(), MoveTrailingAuxiliaryOperatorIntoSeparateQDQClusterPass(), @@ -35,6 +41,8 @@ def __init__(self, passes: list[NeutronEdgePass] = None): steps=10, # Empirical value. At most 10 cycles of passes will be run. ) + self.remove_io_quant_ops = remove_io_quant_ops + def _transform_graph_module(self, module: nn.Module) -> PassResult: """Apply the passes to a single graph module.""" pass_result: PassResult = super().__call__(module) @@ -78,12 +86,17 @@ def __call__(self, epm: EdgeProgramManager) -> EdgeProgramManager: new_programs[name] = new_program - if len(new_programs) == 0: - # No passes were run, return the old EdgeProgramManager. - return epm + result = epm - else: - # Return a new EdgeProgramManager with the updated programs. - return EdgeProgramManager( + if len(new_programs) > 0: + # Use a new EdgeProgramManager with the updated programs if any update was performed. 
+ result = EdgeProgramManager( new_programs, copy.deepcopy(epm._config_methods), epm.compile_config ) + + if self.remove_io_quant_ops: + result = result.transform( + [RemoveIOQuantOpsPass(edge_program_manager=result)] + ) + + return result diff --git a/backends/nxp/neutron_partitioner.py b/backends/nxp/neutron_partitioner.py index 5bcdee0f8b6..965ad41309b 100644 --- a/backends/nxp/neutron_partitioner.py +++ b/backends/nxp/neutron_partitioner.py @@ -8,7 +8,7 @@ import logging import operator from dataclasses import dataclass -from typing import Dict, final, List, Mapping +from typing import final, Mapping import torch @@ -18,12 +18,13 @@ from executorch.backends.nxp.backend.edge_program_converter import ( EdgeProgramToIRConverter, ) -from executorch.backends.nxp.backend.ir.converter.node_converter import Target from torch.export.exported_program import ExportedProgram -from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner +from torch.fx import Graph +from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner, Partition from torch.fx.passes.operator_support import OperatorSupportBase from torch.nn import Parameter from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters import * # noqa F403 +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from executorch.backends.nxp.nxp_backend import NeutronBackend from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.backend.partitioner import ( @@ -34,6 +35,9 @@ from executorch.exir.backend.utils import tag_constant_data from executorch.exir.dialects._ops import ops as exir_ops +NXP_DO_NOT_DELEGATE = "NXP_DO_NOT_DELEGATE" +NXP_DELEGATION_TAG = "delegation_tag" + class QDQClusterRecognizer: """ @@ -60,7 +64,7 @@ class QDQCluster: """ compute_node: torch.fx.Node - ops: List[torch.fx.Node] + ops: list[torch.fx.Node] QUANTIZE_OPERATORS = [ exir_ops.edge.quantized_decomposed.quantize_per_channel.default, @@ 
-93,7 +97,7 @@ def is_dequant_node(node: torch.fx.Node) -> bool: def is_auxiliary_node(node: torch.fx.Node) -> bool: return node.target in QDQClusterRecognizer.AUXILIARY_OPS - def get_qdq_cluster_input_part(self, node: torch.fx.Node) -> List[torch.fx.Node]: + def get_qdq_cluster_input_part(self, node: torch.fx.Node) -> list[torch.fx.Node]: """ Return the list of nodes representing the input part of the QDQ cluster of the node `node`. Those are various dequantization nodes (see DEQUANTIZE_OPERATORS) optionally followed by auxiliary @@ -121,7 +125,7 @@ def get_qdq_cluster_input_part(self, node: torch.fx.Node) -> List[torch.fx.Node] logging.debug(f"Dequant Cluster for {node} is: {qdq_cluster}") return qdq_cluster - def get_qdq_cluster_output_part(self, node: torch.fx.Node) -> List[torch.fx.Node]: + def get_qdq_cluster_output_part(self, node: torch.fx.Node) -> list[torch.fx.Node]: """ Returns the list of nodes representing the output part of the QDQ cluster of the `node`. Those are various quantize nodes (see QUANTIZE_OPERATORS) preceded by auxiliary nodes. @@ -151,7 +155,7 @@ def get_qdq_cluster_output_part(self, node: torch.fx.Node) -> List[torch.fx.Node logging.debug(f"Quant Cluster for {node} is {qdq_cluster}") return qdq_cluster - def get_qdq_cluster(self, node: torch.fx.Node) -> List[torch.fx.Node]: + def get_qdq_cluster(self, node: torch.fx.Node) -> list[torch.fx.Node]: """ Returns the QDQ cluster of the operator, if quantized. If operator is not quantized, returns empty list. 
""" @@ -163,7 +167,7 @@ def get_qdq_cluster(self, node: torch.fx.Node) -> List[torch.fx.Node]: else: return [] - def tag_nodes(self, nodes: List[torch.fx.Node], cluster_name: str) -> None: + def tag_nodes(self, nodes: list[torch.fx.Node], cluster_name: str) -> None: """ Tags a node and its related dequant and quant nodes with a specified cluster name """ @@ -171,7 +175,7 @@ def tag_nodes(self, nodes: List[torch.fx.Node], cluster_name: str) -> None: logging.info(f"Tagging node {node} as {cluster_name}") node.meta["cluster"] = cluster_name - def tag_qdq_clusters(self, nodes: List[torch.fx.Node]): + def tag_qdq_clusters(self, nodes: list[torch.fx.Node]): """ Identifies QDQ clusters and tag them based on compute operation inside. """ @@ -197,6 +201,7 @@ def tag_qdq_clusters(self, nodes: List[torch.fx.Node]): exir_ops.edge.aten.avg_pool2d.default: AvgPool2dConverter, # noqa F405 exir_ops.edge.aten.cat.default: CatConverter, # noqa F405 exir_ops.edge.aten.clone.default: CloneConverter, # noqa F405 + exir_ops.edge.dim_order_ops._clone_dim_order.default: CloneConverter, # noqa F405 exir_ops.edge.aten.constant_pad_nd.default: ConstantPadNDConverter, # noqa F405 exir_ops.edge.aten.convolution.default: ConvolutionConverter, # noqa F405 exir_ops.edge.aten.hardtanh.default: HardTanhConverter, # noqa F405 @@ -206,6 +211,7 @@ def tag_qdq_clusters(self, nodes: List[torch.fx.Node]): exir_ops.edge.aten.mm.default: MMConverter, # noqa F405 exir_ops.edge.aten.relu.default: ReLUConverter, # noqa F405 exir_ops.edge.aten._softmax.default: SoftmaxConverter, # noqa F405 + exir_ops.edge.aten.sub.Tensor: SubTensorConverter, # noqa F405 exir_ops.edge.aten.tanh.default: TanhConverter, # noqa F405 exir_ops.edge.aten.view_copy.default: ViewCopyConverter, # noqa F405 exir_ops.edge.aten.sigmoid.default: SigmoidConverter, # noqa F405 @@ -216,14 +222,14 @@ class NeutronSupportedOperators(OperatorSupportBase): def __init__( self, - qdq_clusters: Dict[str, QDQClusterRecognizer.QDQCluster], - target: 
Target, - operators_not_to_delegate: List[str], + qdq_clusters: dict[str, QDQClusterRecognizer.QDQCluster], + neutron_target_spec: NeutronTargetSpec, + operators_not_to_delegate: list[str], parameters_mapping: dict[str, Parameter], custom_delegation_options: CustomDelegationOptions, ): self.qdq_clusters = qdq_clusters - self.target = target + self.neutron_target_spec = neutron_target_spec self.operators_not_to_delegate = operators_not_to_delegate self.parameters_mapping = parameters_mapping self.custom_delegation_options = custom_delegation_options @@ -246,6 +252,11 @@ def _is_node_supported_compute(self, node: torch.fx.node.Node) -> bool: """ Operator checking function for compute nodes. """ + + if hasattr(node, "meta") and node.meta.get(NXP_DO_NOT_DELEGATE, False): + # The delegation of this node has been prohibited. + return False + if not self.is_node_delegatable(node): return False @@ -260,7 +271,7 @@ def _is_node_supported_compute(self, node: torch.fx.node.Node) -> bool: # TODO: `view_copy` node should be delegated only if it's not the only operator in the cluster. 
node_converter.is_supported( node, - self.target, + self.neutron_target_spec, self.parameters_mapping, self.custom_delegation_options, ) @@ -296,35 +307,58 @@ def is_node_supported( class NeutronPartitioner(Partitioner): def __init__( self, - compile_spec: List[CompileSpec], + compile_spec: list[CompileSpec], custom_delegation_options: CustomDelegationOptions | None = None, ) -> None: self.delegation_spec = DelegationSpec(NeutronBackend.__name__, compile_spec) self.custom_delegation_options = ( custom_delegation_options or CustomDelegationOptions() ) + target = self.delegation_spec[1][2].value.decode() + converter_flavor = self.delegation_spec[1][3].value.decode() + self.neutron_target_spec = NeutronTargetSpec(target, converter_flavor) + + def validate_partitioning_result( + self, + graph: Graph, + partition_list: list[Partition], + custom_delegation_options: CustomDelegationOptions, + ) -> bool: + all_delegated_nodes = { + node for partition in partition_list for node in partition.nodes + } + partitioning_valid = True + for node in graph.nodes: + if ( + node in all_delegated_nodes + and hasattr(node, "target") + and node.target in supported_ops + ): + if not supported_ops[node.target].supports_partitioning_result( + node, partition_list, custom_delegation_options + ): + # This node is not supported within its partition. Exclude it from delegation in the future. 
+ partitioning_valid = False + node.meta[NXP_DO_NOT_DELEGATE] = True + + return partitioning_valid def partition(self, exported_program: ExportedProgram) -> PartitionResult: # Run the CapabilityBasedPartitioner to return the largest possible # subgraphs containing the nodes with the tags logging.info("NeutronPartitioner::partition") partition_tags = {} + partition_list = [] graph_module = exported_program.graph_module nodes = list(graph_module.graph.nodes) qdq_cluster_recognizer = QDQClusterRecognizer() qdq_cluster_recognizer.tag_qdq_clusters(nodes) + graph_module.recompile() - target = None - operators_not_to_delegate = "" - for spec in self.delegation_spec.compile_specs: - if spec.key == "target": - target = Target(spec.value.decode()) - if spec.key == "operators_not_to_delegate": - operators_not_to_delegate = spec.value.decode().split(",") - assert target is not None + operators_not_to_delegate = self.delegation_spec[1][4].value.decode().split(",") logging.info(f"Operators not to delegate: {operators_not_to_delegate}") parameters_mapping = EdgeProgramToIRConverter.map_inputs_to_parameters( @@ -334,7 +368,7 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: exported_program.graph_module, NeutronSupportedOperators( qdq_cluster_recognizer.cluster_map, - target, + self.neutron_target_spec, operators_not_to_delegate, parameters_mapping, self.custom_delegation_options, @@ -342,11 +376,24 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: allows_single_node_partition=True, ) - partition_list = capability_partitioner.propose_partitions() + iteration_limit = len(exported_program.graph.nodes) + for _ in range(iteration_limit): + # Run the partitioning. + partition_list = capability_partitioner.propose_partitions() + + # Check if the nodes support the partitioning result. Mark the problematic nodes with `NXP_DO_NOT_DELEGATE`. 
+ partitioning_valid = self.validate_partitioning_result( + exported_program.graph, partition_list, self.custom_delegation_options + ) + if partitioning_valid: + # The result of the partitioning is fine + break + + # Mark the partitions in the node `meta` attribute. for partition in partition_list: for node in partition.nodes: delegation_tag = f"tag{partition.id}" - node.meta["delegation_tag"] = delegation_tag + node.meta[NXP_DELEGATION_TAG] = delegation_tag partition_tags[delegation_tag] = self.delegation_spec tag_constant_data(exported_program) diff --git a/backends/nxp/nxp_backend.py b/backends/nxp/nxp_backend.py index c801eefec81..44e9a19d9f2 100644 --- a/backends/nxp/nxp_backend.py +++ b/backends/nxp/nxp_backend.py @@ -1,4 +1,4 @@ -# Copyright 2024 NXP +# Copyright 2024-2025 NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -18,11 +18,11 @@ from executorch.backends.nxp.backend.edge_program_converter import ( EdgeProgramToIRConverter, ) -from executorch.backends.nxp.backend.ir.converter.node_converter import Target from executorch.backends.nxp.backend.ir.tensor_formatting import TensorFormat from executorch.backends.nxp.backend.neutron_converter_manager import ( NeutronConverterManager, ) +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from executorch.backends.nxp.neutron_node_extraction import ( extract_artifacts_from_neutron_node, NeutronNodeArtifacts, @@ -36,9 +36,9 @@ class NeutronCompileSpecBuilder: + config: NeutronTargetSpec def __init__(self): - self.config: Target = None self.compile_spec: List[CompileSpec] = [] self.compiler_flags = [] self.output_format = None @@ -64,18 +64,13 @@ def neutron_compile_spec( Args: config: Neutron accelerator configuration, e.g. "imxrt700" neutron_converter_flavor: Flavor of the neutron-converter module to use. Neutron-converter module named " - "'neutron_converter_SDK_25_06' has flavor 'SDK_25_06'. 
+ "'neutron_converter_SDK_25_09' has flavor 'SDK_25_09'. extra_flags: Extra flags for the Neutron compiler operators_not_to_delegate: List of operators that should not be delegated """ - try: - self.config = Target(config) - except ValueError: - raise ValueError( - f"Config `{config}` is not a valid target. Must be one of `{Target.values()}`." - ) self.neutron_converter_flavor = neutron_converter_flavor + self.config = NeutronTargetSpec(config, neutron_converter_flavor) assert ( self.output_format is None @@ -101,7 +96,7 @@ def build(self): self.compile_spec += [ CompileSpec("output_format", "tflite".encode()), CompileSpec("compile_flags", " ".join(self.compiler_flags).encode()), - CompileSpec("target", self.config.value.encode()), + CompileSpec("target", self.config.get_name().encode()), CompileSpec( "neutron_converter_flavor", self.neutron_converter_flavor.encode() ), @@ -187,10 +182,11 @@ def preprocess( # noqa C901 # Convert the edge program to TFLite. tflite_model, io_formats = EdgeProgramToIRConverter().convert_program( edge_program, + neutron_target_spec=NeutronTargetSpec(target, neutron_converter_flavor), ) - neutron_model = NeutronConverterManager().convert( - tflite_model, target, neutron_converter_flavor + neutron_model = NeutronConverterManager(neutron_converter_flavor).convert( + tflite_model, target ) # Dump the tflite file if logging level is enabled diff --git a/backends/nxp/quantizer/neutron_quantizer.py b/backends/nxp/quantizer/neutron_quantizer.py index d3f84144aa3..2681e221869 100644 --- a/backends/nxp/quantizer/neutron_quantizer.py +++ b/backends/nxp/quantizer/neutron_quantizer.py @@ -4,8 +4,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-from typing import List, Optional, Tuple, Union - import torch from executorch.backends.nxp.aten_passes.neutron_aten_pass_manager import ( @@ -27,6 +25,8 @@ LinearPattern, MaxPoolPattern, MeanDimPattern, + MmPattern, + NodeArgsIdx, PadPattern, PermutePattern, QuantizationPattern, @@ -36,6 +36,7 @@ SharedSpecPattern, SigmoidPattern, SoftMaxPattern, + SubTensorPattern, TanhInPlacePattern, TanhPattern, ViewPattern, @@ -106,13 +107,13 @@ def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: ) def annotate_inputs( - inputs: Union[ - List[Tuple[fx.Node, int]], - List[Tuple[fx.Node, int, DerivedQuantizationSpec],], - ], - spec: Optional[QuantizationSpec], + inputs: ( + list[tuple[fx.Node, NodeArgsIdx]] + | list[tuple[fx.Node, NodeArgsIdx, DerivedQuantizationSpec]] + ), + spec: QuantizationSpec | None, ) -> None: - for node, idx, *custom_spec in inputs: + for node, args_idx, *custom_spec in inputs: # pyre-ignore[16]: no attribute annotation = node.meta.get( Q_ANNOTATION_KEY, @@ -120,10 +121,10 @@ def annotate_inputs( ) arg = ( # pyre-ignore[16]: no attribute - node.args[idx] - if isinstance(idx, int) + node.args[args_idx.idx] + if args_idx.inner_idx is None # pyre-ignore[16]: no attribute - else node.args[idx[0]][idx[1]] + else node.args[args_idx.idx][args_idx.inner_idx] ) annotation.input_qspec_map[arg] = ( custom_spec[0] if custom_spec else spec @@ -131,32 +132,18 @@ def annotate_inputs( # pyre-ignore[16]: no attribute node.meta[Q_ANNOTATION_KEY] = annotation - def annotate_weights_or_biases( - weights_or_biases: List[Tuple[fx.Node, int]], - spec: Optional[QuantizationSpec], - ) -> None: - for node, idx, *custom_spec in weights_or_biases: - annotation = node.meta.get( - Q_ANNOTATION_KEY, - QuantizationAnnotation(_annotated=True), - ) - annotation.input_qspec_map[node.args[idx]] = ( - custom_spec[0] if custom_spec else spec - ) - node.meta[Q_ANNOTATION_KEY] = annotation - # pyre-ignore[6]: incompatible parameter type annotate_inputs(anchors.inputs, 
input_act_qspec) - annotate_weights_or_biases(anchors.weights, weight_qspec) + annotate_inputs(anchors.weights, weight_qspec) # pyre-ignore[6]: incompatible parameter type - annotate_weights_or_biases(anchors.biases, bias_qspec) + annotate_inputs(anchors.biases, bias_qspec) return model def validate(self, model: fx.GraphModule) -> None: pass @classmethod - def get_supported_operators(cls) -> List[OperatorConfig]: + def get_supported_operators(cls) -> list[OperatorConfig]: return [] @@ -195,12 +182,7 @@ def get_supported_operators(cls) -> List[OperatorConfig]: class NeutronQuantizer(ComposableQuantizer): def __init__(self): - static_qconfig = QuantizationConfig( - act_qspec, - act_qspec, - wgt_qspec, - None, - ) + static_qconfig = QuantizationConfig(act_qspec, act_qspec, wgt_qspec, None) static_fc_qconfig = QuantizationConfig(act_qspec, act_qspec, wgt_fc_qspec, None) super().__init__( [ @@ -219,6 +201,7 @@ def __init__(self): NeutronAtenQuantizer(LinearPattern(), static_fc_qconfig), NeutronAtenQuantizer(MaxPoolPattern(), static_qconfig), NeutronAtenQuantizer(MeanDimPattern(), static_qconfig), + NeutronAtenQuantizer(MmPattern(), static_qconfig), NeutronAtenQuantizer(PadPattern(), static_qconfig), NeutronAtenQuantizer(PermutePattern(), static_qconfig), NeutronAtenQuantizer(ReluPattern(), static_qconfig), @@ -226,6 +209,7 @@ def __init__(self): NeutronAtenQuantizer(ReshapePattern(), static_qconfig), NeutronAtenQuantizer(SigmoidPattern(), static_qconfig), NeutronAtenQuantizer(SoftMaxPattern(), static_qconfig), + NeutronAtenQuantizer(SubTensorPattern(), static_qconfig), NeutronAtenQuantizer(TanhPattern(), static_qconfig), NeutronAtenQuantizer(TanhInPlacePattern(), static_qconfig), NeutronAtenQuantizer(ViewPattern(), static_qconfig), diff --git a/backends/nxp/quantizer/patterns.py b/backends/nxp/quantizer/patterns.py index 651f995d570..9588ce24c9e 100644 --- a/backends/nxp/quantizer/patterns.py +++ b/backends/nxp/quantizer/patterns.py @@ -7,26 +7,43 @@ from abc import 
ABC, abstractmethod from dataclasses import dataclass, field -from typing import List, Optional, Tuple, Type, Union import torch from executorch.backends.nxp.quantizer.utils import get_bias_qparams from torch import fx from torch._ops import OpOverload +from torchao.quantization.pt2e import PerChannelMinMaxObserver from torchao.quantization.pt2e.quantizer import ( DerivedQuantizationSpec, FixedQParamsQuantizationSpec, + QuantizationSpec, SharedQuantizationSpec, ) from torchao.quantization.pt2e.quantizer.quantizer import Q_ANNOTATION_KEY +@dataclass +class NodeArgsIdx: + """ + Specifies indexes to args paramater of Node in node input annotation. + + + Attributes: + idx (int): Index to Node's args paramater (list). Selects an input Node or a list of Nodes at the index. + inner_idx (int): If specified, index to a list pointed by 'idx' attribute. Selects an input Node at the index. + Default: None. + """ + + idx: int + inner_idx: int = None + + @dataclass class PartitionAnchors: """ - All fields except output are lists of (node, args_index) pair, where node is from - the given partition and node.args[args_index] is an input to the partition. Assumes + All fields except output are lists of (node, node_args_idx) or (node, node_args_idx, quantization_spec) tuples, + where node is from the given partition and node.args[node_args_idx] is an input to the partition. Assumes a single output. Quantizer uses inputs, weights and biases for quantization annotation. 
The others @@ -35,25 +52,23 @@ class PartitionAnchors: """ # Inputs can share quantization parameters - inputs: List[ - Union[ - Tuple[fx.Node, Union[int, Tuple[int, int]]], - Tuple[ - fx.Node, - Union[int, Tuple[int, int]], - SharedQuantizationSpec, - ], - ] + inputs: list[ + tuple[fx.Node, NodeArgsIdx] + | tuple[fx.Node, NodeArgsIdx, SharedQuantizationSpec], ] = field(default_factory=list) - weights: List[Tuple[fx.Node, int]] = field(default_factory=list) - biases: List[ - Union[Tuple[fx.Node, int], Tuple[fx.Node, int, DerivedQuantizationSpec]] + weights: list[ + tuple[fx.Node, NodeArgsIdx] | tuple[fx.Node, NodeArgsIdx, QuantizationSpec], + ] = field(default_factory=list) + biases: list[ + tuple[fx.Node, NodeArgsIdx] + | tuple[fx.Node, NodeArgsIdx, DerivedQuantizationSpec], + ] = field(default_factory=list) + others: list[tuple[fx.Node, NodeArgsIdx]] = field(default_factory=list) + literals: list[tuple[fx.Node, NodeArgsIdx]] = field(default_factory=list) + output: list[ + tuple[fx.Node] + | tuple[fx.Node, FixedQParamsQuantizationSpec | SharedQuantizationSpec], ] = field(default_factory=list) - others: List[Tuple[fx.Node, int]] = field(default_factory=list) - literals: List[Tuple[fx.Node, int]] = field(default_factory=list) - output: List[Union[Tuple[fx.Node], Tuple[fx.Node, SharedQuantizationSpec]]] = field( - default_factory=list - ) empty: bool = False @@ -67,8 +82,8 @@ def partition_types(self) -> list[OpOverload]: @abstractmethod def get_anchors( - self, gm: torch.fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> Optional[PartitionAnchors]: + self, gm: torch.fx.GraphModule, fused_partition: list[fx.GraphModule] + ) -> PartitionAnchors | None: pass @@ -80,11 +95,11 @@ class SharedSpecPattern(QuantizationPattern): quantization parameters (scale and zero-point). 
""" - def partition_types(self) -> List[Type[torch.nn.Module]]: + def partition_types(self) -> list[torch.nn.Module]: pass def get_anchors( - self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] ) -> PartitionAnchors | None: node = fused_partition[0].nodes[-1] assert len(fused_partition[0].input_nodes) == 1 @@ -97,7 +112,7 @@ def get_anchors( qspec = SharedQuantizationSpec(prev_node) return PartitionAnchors( - inputs=[(node, 0)], + inputs=[(node, NodeArgsIdx(0))], weights=[], biases=[], output=[ @@ -126,7 +141,7 @@ def get_anchors_for_fixed_quant_specs( ) return PartitionAnchors( - inputs=[(node, 0)], + inputs=[(node, NodeArgsIdx(0))], weights=[], biases=[], output=[ @@ -154,11 +169,11 @@ def partition_types(self): class AddmmPattern(QuantizationPattern): - def partition_types(self) -> List[OpOverload]: + def partition_types(self) -> list[OpOverload]: return [torch.ops.aten.addmm.default] def get_anchors( - self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] ) -> PartitionAnchors: # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... addmm_node = fused_partition[0].nodes[-1] @@ -176,9 +191,9 @@ def get_anchors( ) return PartitionAnchors( - inputs=[(addmm_node, 1)], - weights=[(addmm_node, 2)], - biases=[(addmm_node, 0, bias_qspec)], + inputs=[(addmm_node, NodeArgsIdx(1))], + weights=[(addmm_node, NodeArgsIdx(2))], + biases=[(addmm_node, NodeArgsIdx(0), bias_qspec)], output=[(addmm_node,)], ) @@ -190,16 +205,42 @@ class AddTensorPattern(QuantizationPattern): Basic quantization for all inputs and output. 
""" - def partition_types(self) -> List[Type[torch.nn.Module]]: + def partition_types(self) -> list[torch.nn.Module]: return [torch.ops.aten.add.Tensor] def get_anchors( - self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] + ) -> PartitionAnchors | None: + node = fused_partition[0].nodes[-1] + inputs = [(node, NodeArgsIdx(0))] + if len(fused_partition[0].input_nodes) == 2: + inputs = [(node, NodeArgsIdx(0)), (node, NodeArgsIdx(1))] + + return PartitionAnchors( + inputs=inputs, + weights=[], + biases=[], + output=[(node,)], + ) + + +class SubTensorPattern(QuantizationPattern): + """ + Quantization pattern for Sub Tensor quantization. Accepts 1 or 2 input nodes. + + Basic quantization for all inputs and output. + """ + + def partition_types(self) -> list[torch.nn.Module]: + return [torch.ops.aten.sub.Tensor] + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] ) -> PartitionAnchors | None: node = fused_partition[0].nodes[-1] - inputs = [(node, 0)] + inputs = [(node, NodeArgsIdx(0))] if len(fused_partition[0].input_nodes) == 2: - inputs = [(node, 0), (node, 1)] + inputs = [(node, NodeArgsIdx(0)), (node, NodeArgsIdx(1))] return PartitionAnchors( inputs=inputs, @@ -242,13 +283,15 @@ def get_anchors( if quantized_input is not None: inputs = [] for idx, _ in enumerate(node.args[0]): - inputs.append((node, (0, idx), SharedQuantizationSpec(quantized_input))) + inputs.append( + (node, NodeArgsIdx(0, idx), SharedQuantizationSpec(quantized_input)) + ) outputs = [(node, SharedQuantizationSpec(quantized_input))] else: # No previous node was quantized => we are not able to share q-params. The conversion to IR will have to # re-quantize the inputs if necessary. 
- inputs = [(node, (0, idx)) for idx in range(len(node.args[0]))] + inputs = [(node, NodeArgsIdx(0, idx)) for idx in range(len(node.args[0]))] outputs = [(node,)] return PartitionAnchors( @@ -259,76 +302,60 @@ def get_anchors( ) -class Conv1dPattern(QuantizationPattern): - def partition_types(self) -> List[OpOverload]: - return [torch.ops.aten.conv1d.default] +class ConvPattern(QuantizationPattern): + @abstractmethod + def partition_types(self) -> list[OpOverload]: + pass def get_anchors( - self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] ) -> PartitionAnchors: - # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... - conv1d_node = fused_partition[0].nodes[-1] + conv_node = fused_partition[0].nodes[-1] - bias_qspec = DerivedQuantizationSpec( + bias_quantization_qspec = DerivedQuantizationSpec( derived_from=[ - (conv1d_node.args[0], conv1d_node), - (conv1d_node.args[1], conv1d_node), + (conv_node.args[0], conv_node), + (conv_node.args[1], conv_node), ], derive_qparams_fn=get_bias_qparams, dtype=torch.int32, - quant_min=-(2**31), + quant_min=-(2**31) + 1, quant_max=2**31 - 1, - qscheme=torch.per_tensor_affine, + qscheme=torch.per_channel_symmetric, + ch_axis=0, + ) + + weight_observer_or_fake_quant_ctr = PerChannelMinMaxObserver + weight_quantization_spec = QuantizationSpec( + dtype=torch.int8, + observer_or_fake_quant_ctr=weight_observer_or_fake_quant_ctr, + quant_min=-127, + quant_max=127, + qscheme=torch.per_channel_symmetric, + ch_axis=0, ) # Keep bias empty if not supplied bias = [] - if len(conv1d_node.args) > 2 and conv1d_node.args[2] is not None: - bias = [(conv1d_node, 2, bias_qspec)] + if len(conv_node.args) > 2 and conv_node.args[2] is not None: + bias = [(conv_node, NodeArgsIdx(2), bias_quantization_qspec)] return PartitionAnchors( - inputs=[(conv1d_node, 0)], - weights=[(conv1d_node, 1)], - # pyre-fixme[6]: Incompatible parameter type + 
inputs=[(conv_node, NodeArgsIdx(0))], + weights=[(conv_node, NodeArgsIdx(1), weight_quantization_spec)], biases=bias, - output=[(conv1d_node,)], + output=[(conv_node,)], ) -class Conv2dPattern(QuantizationPattern): - def partition_types(self) -> List[OpOverload]: - return [torch.ops.aten.conv2d.default] - - def get_anchors( - self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] - ) -> PartitionAnchors: - # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... - conv2d_node = fused_partition[0].nodes[-1] - - bias_qspec = DerivedQuantizationSpec( - derived_from=[ - (conv2d_node.args[0], conv2d_node), - (conv2d_node.args[1], conv2d_node), - ], - derive_qparams_fn=get_bias_qparams, - dtype=torch.int32, - quant_min=-(2**31), - quant_max=2**31 - 1, - qscheme=torch.per_tensor_affine, - ) +class Conv1dPattern(ConvPattern): + def partition_types(self) -> list[OpOverload]: + return [torch.ops.aten.conv1d.default] - # Keep bias empty if not supplied - bias = [] - if len(conv2d_node.args) > 2 and conv2d_node.args[2] is not None: - bias = [(conv2d_node, 2, bias_qspec)] - return PartitionAnchors( - inputs=[(conv2d_node, 0)], - weights=[(conv2d_node, 1)], - # pyre-fixme[6]: Incompatible parameter type - biases=bias, - output=[(conv2d_node,)], - ) +class Conv2dPattern(ConvPattern): + def partition_types(self) -> list[OpOverload]: + return [torch.ops.aten.conv2d.default] class DropoutPattern(SharedSpecPattern): @@ -359,12 +386,12 @@ def partition_types(self): return [torch.ops.aten.hardtanh.default] def get_anchors( - self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] ) -> PartitionAnchors | None: node = fused_partition[0].nodes[-1] return PartitionAnchors( - inputs=[(node, 0)], + inputs=[(node, NodeArgsIdx(0))], weights=[], biases=[], output=[(node,)], @@ -384,12 +411,12 @@ def partition_types(self): return [torch.ops.aten.hardtanh_.default] def get_anchors( - self, 
gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] ) -> PartitionAnchors | None: node = fused_partition[0].nodes[-1] return PartitionAnchors( - inputs=[(node, 0)], + inputs=[(node, NodeArgsIdx(0))], weights=[], biases=[], output=[(node,)], @@ -400,13 +427,12 @@ def replacement_op(self): class LinearPattern(QuantizationPattern): - def partition_types(self) -> List[OpOverload]: + def partition_types(self) -> list[OpOverload]: return [torch.ops.aten.linear.default] def get_anchors( - self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] ) -> PartitionAnchors: - # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge... linear_node = fused_partition[0].nodes[-1] bias_qspec = DerivedQuantizationSpec( @@ -424,12 +450,11 @@ def get_anchors( # Keep bias empty if not supplied bias = [] if len(linear_node.args) > 2: - bias = [(linear_node, 2, bias_qspec)] + bias = [(linear_node, NodeArgsIdx(2), bias_qspec)] return PartitionAnchors( - inputs=[(linear_node, 0)], - weights=[(linear_node, 1)], - # pyre-fixme[6]: Incompatible parameter type + inputs=[(linear_node, NodeArgsIdx(0))], + weights=[(linear_node, NodeArgsIdx(1))], biases=bias, output=[(linear_node,)], ) @@ -453,6 +478,23 @@ def partition_types(self): return [torch.ops.aten.mean.dim] +class MmPattern(QuantizationPattern): + def partition_types(self) -> list[OpOverload]: + return [torch.ops.aten.mm.default] + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] + ) -> PartitionAnchors: + mm_node = fused_partition[0].nodes[-1] + + return PartitionAnchors( + inputs=[(mm_node, NodeArgsIdx(0))], + weights=[(mm_node, NodeArgsIdx(1))], + biases=[], + output=[(mm_node,)], + ) + + class PadPattern(SharedSpecPattern): """ Quantizer for Pad operator. 
@@ -515,7 +557,7 @@ class SoftMaxPattern(QuantizationPattern): The quantization of Softmax output is fixed to scale 1/256, zero point -128, dtype int8. """ - def partition_types(self) -> List[OpOverload]: + def partition_types(self) -> list[OpOverload]: return [torch.ops.aten.softmax.int] def get_anchors( @@ -526,33 +568,33 @@ def get_anchors( ) -class TanhPattern(QuantizationPattern): +class SigmoidPattern(QuantizationPattern): """ - Quantizer for Tanh operator. + Quantizer for Sigmoid operator. - The quantization of Tanh output is fixed to scale 1/128, zero point 0, dtype int8. + The quantization of Sigmoid output is fixed to scale 1/256, zero point -128, dtype int8. """ - def partition_types(self): - return [torch.ops.aten.tanh.default] + def partition_types(self) -> list[OpOverload]: + return [torch.ops.aten.sigmoid.default] def get_anchors( self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] ) -> PartitionAnchors: return get_anchors_for_fixed_quant_specs( - fused_partition, scale=1.0 / 128.0, zero_point=0 + fused_partition, scale=1.0 / 256.0, zero_point=-128 ) -class TanhInPlacePattern(QuantizationPattern): +class TanhPattern(QuantizationPattern): """ - Quantizer for inplace version of Tanh operator (torch.tanh_). + Quantizer for Tanh operator. The quantization of Tanh output is fixed to scale 1/128, zero point 0, dtype int8. """ def partition_types(self): - return [torch.ops.aten.tanh_.default] + return [torch.ops.aten.tanh.default] def get_anchors( self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] @@ -562,19 +604,19 @@ def get_anchors( ) -class SigmoidPattern(QuantizationPattern): +class TanhInPlacePattern(QuantizationPattern): """ - Quantizer for Sigmoid operator. + Quantizer for inplace version of Tanh operator (torch.tanh_). - The quantization of Sigmoid output is fixed to scale 1/256, zero point -128, dtype int8. + The quantization of Tanh output is fixed to scale 1/128, zero point 0, dtype int8. 
""" - def partition_types(self) -> List[OpOverload]: - return [torch.ops.aten.sigmoid.default] + def partition_types(self): + return [torch.ops.aten.tanh_.default] def get_anchors( - self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule] + self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] ) -> PartitionAnchors: return get_anchors_for_fixed_quant_specs( - fused_partition, scale=1.0 / 256.0, zero_point=-128 + fused_partition, scale=1.0 / 128.0, zero_point=0 ) diff --git a/backends/nxp/quantizer/utils.py b/backends/nxp/quantizer/utils.py index ed94183c2db..12c722a8ab3 100644 --- a/backends/nxp/quantizer/utils.py +++ b/backends/nxp/quantizer/utils.py @@ -49,7 +49,7 @@ def get_bias_qparams( act_scale, _ = obs_or_fqs[0].calculate_qparams() weight_scale, _ = obs_or_fqs[1].calculate_qparams() bias_scale = act_scale * weight_scale - bias_zero_point = torch.zeros_like(bias_scale, dtype=torch.int32) + bias_zero_point = torch.zeros_like(bias_scale, dtype=torch.int64) return bias_scale, bias_zero_point diff --git a/backends/nxp/requirements-tests-eiq.txt b/backends/nxp/requirements-tests-eiq.txt index 896d2b8c07e..1fccf010e86 100644 --- a/backends/nxp/requirements-tests-eiq.txt +++ b/backends/nxp/requirements-tests-eiq.txt @@ -1,2 +1,2 @@ --index-url https://eiq.nxp.com/repository -neutron_converter_SDK_25_06 +neutron_converter_SDK_25_09 diff --git a/backends/nxp/runtime/NeutronDriver.h b/backends/nxp/runtime/NeutronDriver.h index 5ae4c3a3ff9..5c47bd74eab 100644 --- a/backends/nxp/runtime/NeutronDriver.h +++ b/backends/nxp/runtime/NeutronDriver.h @@ -18,22 +18,6 @@ extern "C" { #include "NeutronErrors.h" -/* Neutron Driver error category codes */ -typedef enum ERROR_CATEGORY_DRIVER { - ERROR_CATEGORY_DRIVER_GENERIC, /* Generic error category */ - ERROR_CATEGORY_DRIVER_UNSUPPORTED, /* Unsupported function */ - ERROR_CATEGORY_DRIVER_UCODE, /* Microcode bad magic or version incompatible. 
- */ - ERROR_CATEGORY_DRIVER_INVALID, /* Invalid arguments */ - ERROR_CATEGORY_DRIVER_BAD_HANDLE, /* Bad inference handle */ - ERROR_CATEGORY_DRIVER_NO_MEMORY, /* Not enough memory */ - ERROR_CATEGORY_DRIVER_INTERNAL_FAULT, /* Internal error */ - ERROR_CATEGORY_DRIVER_UNKNOWN_ARCH, /* Unknown architecture */ - ERROR_CATEGORY_DRIVER_TRACE_NOT_RUN, /* Tracing did not run, but trace buffer - was requested. */ - ERROR_CATEGORY_DRIVER_TIMEOUT /* Timeout error. */ -} ERROR_CATEGORY_DRIVER; - /// Trace configuration to enable kernel level tracing. #define TRACE_CONFIG_KERNEL_LEVEL (1U << 0) @@ -169,6 +153,12 @@ NeutronError neutronCustomExec( NeutronModelHandle hdl, const NeutronDataConfig* neutron_dcfg); +/// - Setup the input and output data ptr to use Neutron memory area. +/// - The input and output data ptr is stored in neutron_dcfg. +NeutronError neutronDataSetup( + NeutronModelHandle hdl, + NeutronDataConfig* neutron_dcfg); + /// - Prepare Neutron execution for a model with the given configuration. /// - This function only prepares the execution by transferring the parameters /// to the firmware. @@ -245,6 +235,29 @@ void* neutronMemAlloc(size_t alignment, size_t size); /// - This function is only available for Neutron-S in the Linux environment. void neutronMemFree(void* ptr); +/// - Allocates size bytes large buffer in DDR to be used for specialized +/// kernels (e.g. 
batch matmul) +/// Uses Linux CMA allocator +NeutronError allocateBuffer(uint64_t size, void** pBuffer, bool userspace); + +/// - Frees buffer allocated via allocateBuffer function +NeutronError releaseBuffer(void* buffer); + +/// - Clean/flush cache for DDR allocated buffer +/// TODO: rename function as "cleanCache" to satisfy neutron-software naming +/// convention +NeutronError clean_cache(const void* addr, int size); + +/// - Function for calling firmware for specialized kernel (matmul) +NeutronError matmul( + const void* info, + int sizeInfo, + const void* in, + int sizeIn, + const void* out, + int sizeOut, + int idxSlot); + /// Other functions to control the state of driver/firmware. #ifdef __cplusplus } diff --git a/backends/nxp/runtime/NeutronErrors.h b/backends/nxp/runtime/NeutronErrors.h index 5141c4bb4c5..071db8b44be 100644 --- a/backends/nxp/runtime/NeutronErrors.h +++ b/backends/nxp/runtime/NeutronErrors.h @@ -39,6 +39,32 @@ typedef enum ERROR_COMPONENT_ID { ERROR_COMPONENT_DRIVER = 0x3 } ERROR_COMPONENT_ID; +/* Neutron Firmware error category codes */ +typedef enum ERROR_CATEGORY_FW { + ERROR_CATEGORY_FW_GENERIC, /* Generic error category */ + ERROR_CATEGORY_FW_UCODE, /* Microcode bad magic or version incompatible. */ + ERROR_CATEGORY_FW_BUFFER_OVERFLOW, /* Buffer overflow error category */ + ERROR_CATEGORY_FW_NULL_POINTER, /* Pointer is null */ + ERROR_CATEGORY_FW_INTR_ERROR, /* Interrupt triggering error */ + ERROR_CATEGORY_FW_DMAPI_ERROR, /* DM API parameter error */ +} ERROR_CATEGORY_FW; + +/* Neutron Driver error category codes */ +typedef enum ERROR_CATEGORY_DRIVER { + ERROR_CATEGORY_DRIVER_GENERIC, /* Generic error category */ + ERROR_CATEGORY_DRIVER_UNSUPPORTED, /* Unsupported function */ + ERROR_CATEGORY_DRIVER_UCODE, /* Microcode bad magic or version incompatible. 
+ */ + ERROR_CATEGORY_DRIVER_INVALID, /* Invalid arguments */ + ERROR_CATEGORY_DRIVER_BAD_HANDLE, /* Bad inference handle */ + ERROR_CATEGORY_DRIVER_NO_MEMORY, /* Not enough memory */ + ERROR_CATEGORY_DRIVER_INTERNAL_FAULT, /* Internal error */ + ERROR_CATEGORY_DRIVER_UNKNOWN_ARCH, /* Unknown architecture */ + ERROR_CATEGORY_DRIVER_TRACE_NOT_RUN, /* Tracing did not run, but trace buffer + was requested. */ + ERROR_CATEGORY_DRIVER_TIMEOUT /* Timeout error. */ +} ERROR_CATEGORY_DRIVER; + /// Retrieve component name as string from NeutronError code. char* getNeutronErrorComponent(NeutronError ne); diff --git a/backends/nxp/runtime/targets.bzl b/backends/nxp/runtime/targets.bzl index 1eacbbe0a2b..3214761a9cb 100644 --- a/backends/nxp/runtime/targets.bzl +++ b/backends/nxp/runtime/targets.bzl @@ -1,20 +1,25 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//tools/target_determinator/macros:ci.bzl", "ci") def define_common_targets(): runtime.cxx_library( - name = "nxp_backend", + name = "nxp_backend_base", srcs = ["NeutronBackend.cpp"], - headers = ["NeutronDriver.h", "NeutronErrors.h"], - compatible_with = ["ovr_config//cpu:arm32-embedded", "@fbsource//arvr/firmware/projects/smartglasses/config:embedded-mcu-rtos"], - # Neutron runtime needs to compile with executor as whole - # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) + exported_headers = [ + "NeutronDriver.h", + "NeutronErrors.h", + ], link_whole = True, # Constructor needed for backend registration. 
compiler_flags = ["-Wno-global-constructors", "-fno-rtti", "-DNO_HEAP_USAGE"], - visibility = ["@EXECUTORCH_CLIENTS"], + labels = [ci.skip_target()], + visibility = [ + "//executorch/backends/nxp/runtime/fb:nxp_fb_backend", + "//executorch/backends/nxp/runtime/fb:nxp_hifi_fb_backend", + "@EXECUTORCH_CLIENTS", + ], deps = [ "//executorch/runtime/backend:interface", "//executorch/runtime/core:core", - "fbsource//arvr/third-party/toolchains/nxp-sdk/2.16.0/middleware/eiq/executorch/third-party/neutron/rt700:libNeutron", ], ) diff --git a/backends/nxp/tests/TARGETS b/backends/nxp/tests/TARGETS index f492111aff2..c8ccd5fe900 100644 --- a/backends/nxp/tests/TARGETS +++ b/backends/nxp/tests/TARGETS @@ -1,3 +1,4 @@ +load("@fbsource//tools/target_determinator/macros:ci.bzl", "ci") load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") load("@fbcode_macros//build_defs:python_pytest.bzl", "python_pytest") @@ -50,5 +51,9 @@ python_pytest( "//executorch/backends/nxp:neutron_backend", ":executorch_pipeline", ":models", - ] + ], + labels = [ + "local_only", + ci.skip_test(), + ], ) diff --git a/backends/nxp/tests/executorch_pipeline.py b/backends/nxp/tests/executorch_pipeline.py index f2f625ad0c8..09bceb2b0d3 100644 --- a/backends/nxp/tests/executorch_pipeline.py +++ b/backends/nxp/tests/executorch_pipeline.py @@ -15,9 +15,6 @@ from executorch.backends.nxp.edge_passes.neutron_edge_pass_manager import ( NeutronEdgePassManager, ) -from executorch.backends.nxp.edge_passes.remove_io_quant_ops_pass import ( - RemoveIOQuantOpsPass, -) from executorch.backends.nxp.neutron_partitioner import NeutronPartitioner from executorch.backends.nxp.nxp_backend import generate_neutron_compile_spec from executorch.backends.nxp.quantizer.neutron_quantizer import NeutronQuantizer @@ -38,9 +35,9 @@ class ModelInputSpec: dtype: torch.dtype = torch.float32 -def _quantize_model(model, calibration_inputs: list[tuple[torch.Tensor, ...]]): - quantizer = NeutronQuantizer() - +def 
_quantize_model( + model, quantizer, calibration_inputs: list[tuple[torch.Tensor, ...]] +): m = prepare_pt2e(model, quantizer) for data in calibration_inputs: m(*data) @@ -88,9 +85,10 @@ def to_quantized_edge_program( [tuple[ModelInputSpec, ...]], list[tuple[torch.Tensor, ...]] ] = get_random_calibration_inputs, target="imxrt700", - neutron_converter_flavor="SDK_25_06", + neutron_converter_flavor="SDK_25_09", remove_quant_io_ops=False, custom_delegation_options=CustomDelegationOptions(), # noqa B008 + get_quantizer_fn=lambda: NeutronQuantizer(), ) -> EdgeProgramManager: calibration_inputs = get_calibration_inputs_fn(to_model_input_spec(input_spec)) @@ -102,7 +100,9 @@ def to_quantized_edge_program( exir_program_aten = torch.export.export(model, example_input, strict=True) exir_program_aten__module_quant = _quantize_model( - exir_program_aten.module(), calibration_inputs + exir_program_aten.module(), + get_quantizer_fn(), + calibration_inputs, ) edge_compile_config = EdgeCompileConfig(_check_ir_validity=False) @@ -112,7 +112,9 @@ def to_quantized_edge_program( edge_compile_config=edge_compile_config, ) - edge_program_manager = NeutronEdgePassManager()(edge_program_manager) + edge_program_manager = NeutronEdgePassManager( + remove_io_quant_ops=remove_quant_io_ops + )(edge_program_manager) compile_spec = generate_neutron_compile_spec( target, @@ -122,11 +124,6 @@ def to_quantized_edge_program( partitioner = NeutronPartitioner(compile_spec, custom_delegation_options) edge_program_manager = edge_program_manager.to_backend(partitioner) - if remove_quant_io_ops: - edge_program_manager = edge_program_manager.transform( - [RemoveIOQuantOpsPass(edge_program_manager=edge_program_manager)] - ) - return edge_program_manager diff --git a/backends/nxp/tests/executors.py b/backends/nxp/tests/executors.py index afdb15af106..632e3da055f 100644 --- a/backends/nxp/tests/executors.py +++ b/backends/nxp/tests/executors.py @@ -1,4 +1,4 @@ -# Copyright 2023-2024 NXP +# Copyright 2023-2025 
NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -18,10 +18,8 @@ create_channels_first_to_channels_last_permutation, create_channels_last_to_channels_first_permutation, ) -from executorch.backends.nxp.backend.ir.converter.node_converter import ( - NodeConverter, - Target, -) +from executorch.backends.nxp.backend.ir.converter.node_converter import NodeConverter +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from torch.export import ExportedProgram from torch.fx import Node from torch.fx.graph import Graph @@ -196,6 +194,11 @@ def compare_output_arrays( assert tfl_output.shape == edge_output.shape, "Output shapes don't match!" + if (max_diff := np.abs(np.max(tfl_output - edge_output))) > 0.0: + logger.w( + f"Maximum absolute difference of the tensor '{output_name}': '{max_diff}'" + ) + assert np.allclose( tfl_output, edge_output, rtol=rtol, atol=atol, equal_nan=True ), f"Output values of the `{output_name}` tensor don't match!" 
@@ -365,10 +368,16 @@ def convert_run_compare( def graph_contains_any_of_ops(graph: Graph, ops: list) -> bool: - return any(node.target in ops for node in graph.nodes) + return graph_contains_any( + graph, condition=lambda n: hasattr(n, "target") and n.target in ops + ) + + +def graph_contains_any(graph: Graph, condition: Callable[[Node], bool]) -> bool: + return any(map(condition, graph.nodes)) -target_support_check_function = Callable[[Node, Target], bool] +target_support_check_function = Callable[[Node, NeutronTargetSpec], bool] class OverrideTargetSupportCheck: diff --git a/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py index 567b593e05b..2c3107eae77 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py @@ -1,3 +1,7 @@ +# Copyright 2025 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. import numpy as np import pytest import torch diff --git a/backends/nxp/tests/ir/converter/node_converter/test_addmm_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_addmm_converter.py new file mode 100644 index 00000000000..6571ef8773e --- /dev/null +++ b/backends/nxp/tests/ir/converter/node_converter/test_addmm_converter.py @@ -0,0 +1,89 @@ +# Copyright 2025 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest + +import kgb +import numpy as np +import torch + +from executorch.backends.nxp.backend.edge_program_converter import ( + EdgeProgramToIRConverter, +) +from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program +from executorch.backends.nxp.tests.executors import ( + convert_run_compare, + graph_contains_any_of_ops, +) +from executorch.backends.nxp.tests.models import AddmmModule, LinearModule +from executorch.exir.dialects._ops import ops as exir_ops +from torch.export import ExportedProgram + + +class TestAddmmConversion(unittest.TestCase): + @classmethod + def setUpClass(cls): + torch.manual_seed(23) + np.random.seed(42) + + def test_addmm_conversion(self): + with kgb.spy_on( + EdgeProgramToIRConverter.convert_program, call_original=True + ) as converter_spy: + input_shape = (1, 32) + model = AddmmModule(input_shape[1]) + + edge_program = to_quantized_edge_program( + model, input_shape + ).exported_program() + + # Make sure that all nodes were delegated. + assert not graph_contains_any_of_ops( + graph=edge_program.graph, ops=[exir_ops.edge.aten.addmm.default] + ) + assert any( + "lowered_module" in node.name for node in edge_program.graph.nodes + ) + + tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + exported_program: ExportedProgram = converter_spy.calls[-1].args[0] + input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( + np.int8 + ) + convert_run_compare( + exported_program, + input_data, + tfl_model=tflite_flatbuffers_model, + ) + + def test_linear_conversion__with_bias(self): + with kgb.spy_on( + EdgeProgramToIRConverter.convert_program, call_original=True + ) as converter_spy: + input_shape = (10, 32) + model = LinearModule(bias=True) + + edge_program = to_quantized_edge_program( + model, input_shape + ).exported_program() + + # Make sure that all nodes were delegated. 
+ assert not graph_contains_any_of_ops( + graph=edge_program.graph, ops=[exir_ops.edge.aten.addmm.default] + ) + assert any( + "lowered_module" in node.name for node in edge_program.graph.nodes + ) + + tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + exported_program: ExportedProgram = converter_spy.calls[-1].args[0] + input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( + np.int8 + ) + convert_run_compare( + exported_program, + input_data, + tfl_model=tflite_flatbuffers_model, + ) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py index f5945607f1b..c02d184c5ae 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_clone_converter.py @@ -4,31 +4,33 @@ # LICENSE file in the root directory of this source tree. +import itertools +import unittest + +import kgb import numpy as np -import pytest import torch from executorch.backends.nxp.backend.edge_program_converter import ( EdgeProgramToIRConverter, ) -from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program +from executorch.backends.nxp.tests.executorch_pipeline import ( + to_edge_program, + to_quantized_edge_program, +) from executorch.backends.nxp.tests.executors import ( convert_run_compare, + graph_contains_any, graph_contains_any_of_ops, - ToNCHWPreprocess, - ToNHWCPreprocess, + ToChannelFirstPreprocess, + ToChannelLastPreprocess, ) from executorch.exir.dialects._ops import ops as exir_ops +from parameterized import parameterized from torch import nn from torch.export import ExportedProgram -@pytest.fixture(autouse=True) -def reseed_model_per_test_run(): - torch.manual_seed(23) - np.random.seed(23) - - class SingleConvBlockWithDropout(torch.nn.Module): def __init__( self, conv_in_channels: int = 3, perform_inplace_dropout: bool = False @@ 
-74,57 +76,108 @@ def forward(self, x): return self.block(x) -@pytest.mark.parametrize("inplace_dropout", [False, True]) -@pytest.mark.parametrize("input_shape", [(1, 3, 128, 128), (1, 3, 256, 256)]) -def test_conv_dropout_quant(mocker, inplace_dropout: bool, input_shape: tuple[int]): - model = SingleConvBlockWithDropout( - conv_in_channels=input_shape[1], perform_inplace_dropout=inplace_dropout - ).eval() +class TestCloneConverter(unittest.TestCase): + __test__ = False # Prevent interfering with PyTest tests - converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program") + @classmethod + def setUpClass(cls): + torch.manual_seed(23) + np.random.seed(23) - quantized_program = to_quantized_edge_program(model, input_shape).exported_program() + @staticmethod + def _node_is_clone(node) -> bool: + clone_ops = [ + exir_ops.edge.aten.clone.default, + exir_ops.edge.dim_order_ops._clone_dim_order.default, + ] - tflite_flatbuffers_model, io_formats = converter_spy.spy_return - exported_program: ExportedProgram = converter_spy.call_args.args[1] - - assert not graph_contains_any_of_ops( - graph=quantized_program.graph, ops=[exir_ops.edge.aten.clone.default] - ) - - input_data = (np.random.random(input_shape) * 50).astype(np.int8) - convert_run_compare( - exported_program, - tfl_model=tflite_flatbuffers_model, - tflite_input_preprocess=ToNHWCPreprocess(), - tflite_output_preprocess=ToNCHWPreprocess(), - input_data=input_data, - atol=1.0, - ) + def target_can_be_clone(node): + if hasattr(node, "op") and node.op == "call_function": + return "clone" in node.target.__name__ + return False -@pytest.mark.parametrize("inplace_dropout", [False, True]) -def test_clone_pool_view_copy_quant( - mocker, inplace_dropout: bool, input_shape: tuple[int] = (1, 64, 25, 5) -): - model = KWSFinalBlock(input_shape).eval() + return node in clone_ops or target_can_be_clone(node) - converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program") - - quantized_program = 
to_quantized_edge_program(model, input_shape).exported_program() - - tflite_flatbuffers_model, io_formats = converter_spy.spy_return - exported_program: ExportedProgram = converter_spy.call_args.args[1] - - assert not graph_contains_any_of_ops( - graph=quantized_program.graph, ops=[exir_ops.edge.aten.clone.default] + @parameterized.expand( + list(itertools.product([True, False], [(1, 3, 128, 128), (1, 3, 256, 256)])) ) - - input_data = (np.random.random(input_shape) * 50).astype(np.int8) - convert_run_compare( - exported_program, - tfl_model=tflite_flatbuffers_model, - tflite_input_preprocess=ToNHWCPreprocess(), - input_data=input_data, - atol=1.0, + def test_conv_dropout_quant(self, inplace_dropout: bool, input_shape: tuple[int]): + model = SingleConvBlockWithDropout( + conv_in_channels=input_shape[1], perform_inplace_dropout=inplace_dropout + ).eval() + + with kgb.spy_on( + EdgeProgramToIRConverter.convert_program, call_original=True + ) as converter_spy: + quantized_program = to_quantized_edge_program( + model, input_shape + ).exported_program() + + tflite_flatbuffers_model, _ = converter_spy.calls[-1].return_value + exported_program: ExportedProgram = converter_spy.calls[-1].args[0] + + assert not graph_contains_any( + graph=quantized_program.graph, + condition=TestCloneConverter._node_is_clone, + ) + + input_data = (np.random.random(input_shape) * 50).astype(np.int8) + convert_run_compare( + exported_program, + tfl_model=tflite_flatbuffers_model, + tflite_input_preprocess=ToChannelLastPreprocess(), + tflite_output_preprocess=ToChannelFirstPreprocess(), + input_data=input_data, + atol=1.0, + ) + + @parameterized.expand( + list(itertools.product([True, False], [(1, 3, 128, 128), (1, 3, 256, 256)])) ) + def test_conv_dropout_no_quant( + self, inplace_dropout: bool, input_shape: tuple[int] + ): + model = SingleConvBlockWithDropout( + conv_in_channels=input_shape[1], perform_inplace_dropout=inplace_dropout + ).eval() + + edge_program = to_edge_program(model, 
input_shape).exported_program() + + has_clone = graph_contains_any_of_ops( + graph=edge_program.graph, + ops=[ + exir_ops.edge.aten.clone.default, + exir_ops.edge.dim_order_ops._clone_dim_order.default, + ], + ) + + # Clone with inplace=True should not produce clone edge op and vice versa + assert inplace_dropout ^ has_clone + + def test_clone_pool_view_copy_quant(self, input_shape: tuple[int] = (1, 64, 25, 5)): + model = KWSFinalBlock(input_shape).eval() + + with kgb.spy_on( + EdgeProgramToIRConverter.convert_program, call_original=True + ) as converter_spy: + quantized_program = to_quantized_edge_program( + model, input_shape + ).exported_program() + + tflite_flatbuffers_model, _ = converter_spy.calls[-1].return_value + exported_program: ExportedProgram = converter_spy.calls[-1].args[0] + + assert not graph_contains_any( + graph=quantized_program.graph, + condition=TestCloneConverter._node_is_clone, + ) + + input_data = (np.random.random(input_shape) * 50).astype(np.int8) + convert_run_compare( + exported_program, + tfl_model=tflite_flatbuffers_model, + tflite_input_preprocess=ToChannelLastPreprocess(), + input_data=input_data, + atol=1.0, + ) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_conv_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_conv_converter.py index 745b26ef8ff..d7a59cad6d6 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_conv_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_conv_converter.py @@ -76,7 +76,18 @@ def test_conv1d_quant_conversion(stride, dilation, kernel_size, mocker): @pytest.mark.parametrize("stride", [1, 2]) @pytest.mark.parametrize("dilation", [2, 1]) -@pytest.mark.parametrize("kernel_size", [(1,), (3,)]) +@pytest.mark.parametrize( + "kernel_size", + [ + pytest.param( + (1,), + marks=pytest.mark.xfail( + reason="Regression in Neutron SW 2.1.x (AIR-13336)", strict=True + ), + ), + (3,), + ], +) @pytest.mark.parametrize("padding", [(1,), 2]) def 
test_conv1d_quant_conversion__padded( stride, dilation, kernel_size, padding, mocker @@ -179,7 +190,18 @@ def test_conv1d_quant_conversion__depthwise(stride, dilation, kernel_size, mocke @pytest.mark.parametrize("stride", [1, 2]) @pytest.mark.parametrize("dilation", [2, 1]) -@pytest.mark.parametrize("kernel_size", [(1,), (3,)]) +@pytest.mark.parametrize( + "kernel_size", + [ + pytest.param( + (1,), + marks=pytest.mark.xfail( + reason="Regression in Neutron SW 2.1.x (AIR-13336)", strict=True + ), + ), + (3,), + ], +) @pytest.mark.parametrize("padding", [(1,), 2]) def test_conv1d_quant_conversion__depthwise__padded( stride, dilation, kernel_size, padding, mocker diff --git a/backends/nxp/tests/ir/converter/node_converter/test_hardtanh_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_hardtanh_converter.py index e17868d16e2..c4bc559817b 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_hardtanh_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_hardtanh_converter.py @@ -57,7 +57,7 @@ def test_relu6_quant(mocker, input_shape: tuple[int], inplace: bool): tflite_input_preprocess=ToNHWCPreprocess(), tflite_output_preprocess=ToNCHWPreprocess(), input_data=input_data, - atol=1.0, + atol=2.0, ) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_linear_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_linear_converter.py deleted file mode 100644 index 858724522cd..00000000000 --- a/backends/nxp/tests/ir/converter/node_converter/test_linear_converter.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright 2024 NXP -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import numpy as np -import pytest -import torch - -from executorch.backends.nxp.tests.executorch_pipeline import to_edge_program -from executorch.backends.nxp.tests.executors import convert_run_compare -from executorch.backends.nxp.tests.models import LinearModule -from executorch.exir.dialects._ops import ops as exir_ops - - -@pytest.fixture(autouse=True) -def reseed_model_per_test_run(): - torch.manual_seed(23) - np.random.seed(23) - - -def test_linear_conversion__with_bias(): - input_shape = (10, 32) - edge_program = to_edge_program( - LinearModule(bias=True), input_shape - ).exported_program() - - input_data = np.random.random(input_shape).astype(np.float32) - - nodes = list(edge_program.graph.nodes) - assert nodes[4].target == exir_ops.edge.aten.addmm.default - assert len(nodes[4].args) == 3 # Has bias. - - convert_run_compare(edge_program, input_data=input_data) - - -def test_linear_conversion__without_bias(): - input_shape = (10, 32) - edge_program = to_edge_program( - LinearModule(bias=False), input_shape - ).exported_program() - - input_data = np.random.random(input_shape).astype(np.float32) - - nodes = list(edge_program.graph.nodes) - assert nodes[3].target == exir_ops.edge.aten.mm.default - assert len(nodes[3].args) == 2 # No bias. 
- - convert_run_compare(edge_program, input_data=input_data) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_mean_dim_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_mean_dim_converter.py index 0032eae5c1a..a634416f8a7 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_mean_dim_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_mean_dim_converter.py @@ -49,6 +49,7 @@ def test_mean_dim_conv_quant_conversion(mocker, input_shape, dim, keeepdim=True) input_data=input_data, tflite_output_preprocess=ToChannelFirstPreprocess(), tfl_model=tflite_flatbuffers_model, + atol=1.0, ) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_mm_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_mm_converter.py new file mode 100644 index 00000000000..609c0f6c78c --- /dev/null +++ b/backends/nxp/tests/ir/converter/node_converter/test_mm_converter.py @@ -0,0 +1,89 @@ +# Copyright 2025 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest + +import kgb +import numpy as np +import torch + +from executorch.backends.nxp.backend.edge_program_converter import ( + EdgeProgramToIRConverter, +) +from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program +from executorch.backends.nxp.tests.executors import ( + convert_run_compare, + graph_contains_any_of_ops, +) +from executorch.backends.nxp.tests.models import LinearModule, MmModule +from executorch.exir.dialects._ops import ops as exir_ops +from torch.export import ExportedProgram + + +class TestMmConversion(unittest.TestCase): + @classmethod + def setUpClass(cls): + torch.manual_seed(23) + np.random.seed(42) + + def test_mm_conversion(self): + with kgb.spy_on( + EdgeProgramToIRConverter.convert_program, call_original=True + ) as converter_spy: + input_shape = (1, 32) + model = MmModule(input_shape[1]) + + edge_program = to_quantized_edge_program( + model, input_shape + ).exported_program() + + # Make sure that all nodes were delegated. + assert not graph_contains_any_of_ops( + graph=edge_program.graph, ops=[exir_ops.edge.aten.mm.default] + ) + assert any( + "lowered_module" in node.name for node in edge_program.graph.nodes + ) + + tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + exported_program: ExportedProgram = converter_spy.calls[-1].args[0] + input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( + np.int8 + ) + convert_run_compare( + exported_program, + input_data, + tfl_model=tflite_flatbuffers_model, + ) + + def test_linear_conversion__without_bias(self): + with kgb.spy_on( + EdgeProgramToIRConverter.convert_program, call_original=True + ) as converter_spy: + input_shape = (10, 32) + model = LinearModule(bias=False) + + edge_program = to_quantized_edge_program( + model, input_shape + ).exported_program() + + # Make sure that all nodes were delegated. 
+ assert not graph_contains_any_of_ops( + graph=edge_program.graph, ops=[exir_ops.edge.aten.mm.default] + ) + assert any( + "lowered_module" in node.name for node in edge_program.graph.nodes + ) + + tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + exported_program: ExportedProgram = converter_spy.calls[-1].args[0] + input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( + np.int8 + ) + convert_run_compare( + exported_program, + input_data, + tfl_model=tflite_flatbuffers_model, + ) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py new file mode 100644 index 00000000000..98566ff1ad6 --- /dev/null +++ b/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py @@ -0,0 +1,175 @@ +# Copyright 2025 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+import numpy as np +import pytest +import torch + +from executorch.backends.nxp.backend.edge_program_converter import ( + EdgeProgramToIRConverter, +) +from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program +from executorch.backends.nxp.tests.executors import ( + convert_run_compare, + ToChannelFirstPreprocess, + ToChannelLastPreprocess, +) +from executorch.backends.nxp.tests.models import ( + SubTensorConvModule, + SubTensorModule, + SubTensorOneInputModule, +) +from executorch.exir.dialects._ops import ops as exir_ops +from torch.export import ExportedProgram + + +@pytest.fixture(autouse=True) +def reseed_model_per_test_run(): + torch.manual_seed(23) + np.random.seed(23) + + +@pytest.mark.parametrize( + "input_shape", + [ + pytest.param((4,), id="1D."), + pytest.param((6, 6), id="2D."), + pytest.param((1, 4, 8), id="3D."), + pytest.param((1, 4, 8, 8), id="4D."), + ], +) +def test_sub_tensor_quant_conversion(mocker, input_shape): + model = SubTensorModule() + + converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program") + + # Run conversion + _ = to_quantized_edge_program(model, [input_shape, input_shape]) + + # Capture generated model + tflite_flatbuffers_model, io_formats = converter_spy.spy_return + + # Capture converted program + exported_program: ExportedProgram = converter_spy.call_args.args[1] + + input_data_1 = (np.random.random(input_shape).astype(np.float32) * 50).astype( + np.int8 + ) + input_data_2 = (np.random.random(input_shape).astype(np.float32) * 50).astype( + np.int8 + ) + input_data = {0: input_data_1, 1: input_data_2} + + nodes = list(exported_program.graph.nodes) + assert nodes[4].target == exir_ops.edge.aten.sub.Tensor + + convert_run_compare( + exported_program, tfl_model=tflite_flatbuffers_model, input_data=input_data + ) + + +@pytest.mark.parametrize( + "input_shape", + [ + pytest.param((4,), id="1D."), + pytest.param((6, 6), id="2D."), + pytest.param((1, 4, 8), id="3D."), + pytest.param((1, 4, 
8, 8), id="4D."), + ], +) +def test_sub_tensor_one_input_quant_conversion(mocker, input_shape): + model = SubTensorOneInputModule() + + converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program") + + # Run conversion + _ = to_quantized_edge_program(model, input_shape) + + # Capture generated model + tflite_flatbuffers_model, io_formats = converter_spy.spy_return + + # Capture converted program + exported_program: ExportedProgram = converter_spy.call_args.args[1] + + input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(np.int8) + + nodes = list(exported_program.graph.nodes) + assert nodes[2].target == exir_ops.edge.aten.sub.Tensor + + convert_run_compare( + exported_program, tfl_model=tflite_flatbuffers_model, input_data=input_data + ) + + +@pytest.mark.parametrize( + "x_input_shape", + [ + pytest.param((1, 4, 8, 8), id="4D."), + pytest.param((1, 4, 5, 5), id="4D, product of dims is not a multiple of 8."), + ], +) +def test_sub_tensor_w_conv_quant_conversion(mocker, x_input_shape): + model = SubTensorConvModule() + + converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program") + + n, c, h, w = x_input_shape + y_input_shape = (n, 8, h, w) + + # Run conversion + _ = to_quantized_edge_program(model, [x_input_shape, y_input_shape]) + + # Capture generated model + tflite_flatbuffers_model, io_formats = converter_spy.spy_return + + # Capture converted program + exported_program: ExportedProgram = converter_spy.call_args.args[1] + + input_data_1 = (np.random.random(x_input_shape).astype(np.float32) * 50).astype( + np.int8 + ) + input_data_2 = (np.random.random(y_input_shape).astype(np.float32) * 50).astype( + np.int8 + ) + input_data = {0: input_data_1, 1: input_data_2} + + nodes = list(exported_program.graph.nodes) + assert nodes[15].target == exir_ops.edge.aten.sub.Tensor + + convert_run_compare( + exported_program, + input_data=input_data, + tflite_input_preprocess=ToChannelLastPreprocess(), + 
tfl_model=tflite_flatbuffers_model, + tflite_output_preprocess=ToChannelFirstPreprocess(), + ) + + +@pytest.mark.parametrize( + "x_input_shape, y_input_shape", + [ + pytest.param((1, 4, 7), (4, 7), id="3D -> 2D."), + pytest.param((1, 4, 8), (1, 4, 4, 8), id="3D -> 4D."), + pytest.param((1, 1, 4, 4, 8), (1, 4, 4, 8), id="5D -> 4D."), + pytest.param((4,), (4, 4), id="1D -> 2D."), + pytest.param((4,), (4, 4, 4), id="1D -> 3D."), + pytest.param((6, 6), (1, 8, 6, 6), id="2D -> 4D."), + pytest.param((6, 6), (6,), id="2D -> 1D."), + ], +) +def test_sub_tensor_broadcasting_unsupported_quant_conversion( + x_input_shape, y_input_shape +): + model = SubTensorModule() + + # Run conversion + edge_program = to_quantized_edge_program( + model, [x_input_shape, y_input_shape] + ).exported_program() + nodes = list(edge_program.graph.nodes) + + # Broadcast is not supported, node is not converted + assert ( + nodes[6].target == exir_ops.edge.aten.sub.Tensor + ) # Sub Tensor is not delegated. diff --git a/backends/nxp/tests/ir/converter/node_converter/test_tanh_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_tanh_converter.py index 40857d18eb8..bb4500bc1e2 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_tanh_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_tanh_converter.py @@ -27,6 +27,11 @@ class TestTanhConverter(unittest.TestCase): __test__ = False # Prevent interfering with PyTest tests + @classmethod + def setUpClass(cls): + torch.manual_seed(23) + np.random.seed(23) + @parameterized.expand( input=[ ( @@ -76,10 +81,5 @@ def test_conv_tanh( tflite_input_preprocess=ToChannelLastPreprocess(), tflite_output_preprocess=ToChannelFirstPreprocess(), input_data=input_data, - atol=1.0, + atol=2.0, ) - - @classmethod - def setUpClass(cls): - torch.manual_seed(23) - np.random.seed(23) diff --git a/backends/nxp/tests/models.py b/backends/nxp/tests/models.py index bdad9ddc4b4..f613349fed0 100644 --- a/backends/nxp/tests/models.py +++ 
b/backends/nxp/tests/models.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import math from typing import Callable, Collection, Union import torch @@ -169,6 +170,32 @@ def forward(self, x): return self.linear(x) +class AddmmModule(torch.nn.Module): + def __init__(self, in_channels: int): + super().__init__() + self.weight = torch.nn.Parameter(torch.empty(in_channels, in_channels)) + self.bias = torch.nn.Parameter(torch.empty(in_channels)) + torch.nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5)) + fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(self.weight) + bound = 1 / math.sqrt(fan_in) + torch.nn.init.uniform_(self.bias, -bound, bound) + self.eval() + + def forward(self, x): + return torch.addmm(self.bias, x, self.weight) + + +class MmModule(torch.nn.Module): + def __init__(self, in_channels: int): + super().__init__() + self.weight = torch.nn.Parameter(torch.empty(in_channels, in_channels)) + torch.nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5)) + self.eval() + + def forward(self, x): + return torch.mm(x, self.weight) + + class LinearSoftmaxModule(torch.nn.Module): def __init__(self): super().__init__() @@ -424,6 +451,34 @@ def forward(x): return x + x +class SubTensorModule(torch.nn.Module): + def __init__(self): + super().__init__() + + @staticmethod + def forward(x, y): + return x - y + + +class SubTensorConvModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = Conv2dModule(padding=1, stride=1) + + def forward(self, x, y): + x = self.conv(x) + return x - y + + +class SubTensorOneInputModule(torch.nn.Module): + def __init__(self): + super().__init__() + + @staticmethod + def forward(x): + return x - x + + class MeanDimLinearModule(torch.nn.Module): def __init__(self, dim, keepdim): super().__init__() diff --git a/backends/nxp/tests/test_batch_norm_fusion.py b/backends/nxp/tests/test_batch_norm_fusion.py index 
3f1106c6d24..788d04c6dad 100644 --- a/backends/nxp/tests/test_batch_norm_fusion.py +++ b/backends/nxp/tests/test_batch_norm_fusion.py @@ -168,7 +168,7 @@ def test_batch_norm_conv_fusing__full_pipeline__1d(bias: bool): nodes = list(edge_program.graph.nodes) assert ( - len(nodes) == 13 + len(nodes) == 17 ) # 1D Conv currently isn't delegated, because it doesn't get quantized. assert not any( node.op == "call_function" and "batch_norm" in node.target.__name__ diff --git a/backends/nxp/tests/test_context_sensitive_delegation.py b/backends/nxp/tests/test_context_sensitive_delegation.py new file mode 100644 index 00000000000..1919bc63d82 --- /dev/null +++ b/backends/nxp/tests/test_context_sensitive_delegation.py @@ -0,0 +1,71 @@ +# Copyright 2025 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import numpy as np +import torch + +from executorch.backends.nxp.backend.ir.converter.node_converters.ops_converters import ( + ViewCopyConverter, +) +from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program +from executorch.backends.nxp.tests.executors import graph_contains_any_of_ops +from executorch.exir.dialects._ops import ops as exir_ops + + +class SingleViewCopyModule(torch.nn.Module): + def __init__(self, new_shape: list[int]): + super().__init__() + self.new_shape = new_shape + + def forward(self, x): + return torch.reshape(x, self.new_shape) + + +class TestContextSensitiveDelegation(unittest.TestCase): + __test__ = False # Prevent interfering with PyTest tests. + + @classmethod + def setUpClass(cls): + torch.manual_seed(23) + np.random.seed(42) + + def test_single_view_copy_partition(self): + input_shape = (2, 10) + module = SingleViewCopyModule([1, 20]) + + ep = to_quantized_edge_program(module, input_shape).exported_program() + + # Make sure the `view_copy` was not delegated. 
+ assert graph_contains_any_of_ops( + ep.graph, [exir_ops.edge.aten.view_copy.default] + ) + assert not any("delegate" in n.name for n in ep.graph.nodes) + + def test_single_view_copy_partition__forced_delegation(self): + input_shape = (2, 10) + module = SingleViewCopyModule([1, 20]) + + def _supported_partitioning(*_): + return True + + # Replace the partition support check function, to accept anything. + original_supports_partitioning_result = ( + ViewCopyConverter.supports_partitioning_result + ) + ViewCopyConverter.supports_partitioning_result = _supported_partitioning + + with self.assertRaises(RuntimeError) as e: + to_quantized_edge_program(module, input_shape).exported_program() + assert ( + str(e.exception) + == "Model converted with neutron-converter does not contain a NeutronGraph node." + ) + + # Return to the original partition support check function. + ViewCopyConverter.supports_partitioning_result = ( + original_supports_partitioning_result + ) diff --git a/backends/nxp/tests/test_linear_and_add_fusion.py b/backends/nxp/tests/test_linear_and_add_fusion.py new file mode 100644 index 00000000000..16d3c4140a2 --- /dev/null +++ b/backends/nxp/tests/test_linear_and_add_fusion.py @@ -0,0 +1,644 @@ +# Copyright 2025 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest +from copy import deepcopy + +import numpy as np +import torch + +from executorch.backends.nxp.aten_passes.fuse_linear_and_add_pass import ( + FuseLinearAndAddPass, +) +from executorch.backends.nxp.aten_passes.neutron_aten_pass_manager import ( + NeutronAtenPassManager, +) +from executorch.backends.nxp.aten_passes.remove_nodes_with_known_outputs import ( + RemoveNodesWithKnownOutputs, +) +from executorch.backends.nxp.tests.executors import graph_contains_any_of_ops +from parameterized import parameterized + + +class LinearAddModule(torch.nn.Module): + def __init__( + self, + fc_in_features: int, + fc_out_features: int, + bias: bool, + artificial_bias_shape: list[int], + alpha=1.0, + ): + super().__init__() + self.fc_in_features = fc_in_features + self.fc_out_features = fc_out_features + self.bias = bias + self.artificial_bias_shape = artificial_bias_shape + self.alpha = alpha + self.linear = torch.nn.Linear(fc_in_features, fc_out_features, bias=bias) + self.eval() + + def forward(self, x): + artificial_bias = torch.ones(self.artificial_bias_shape, dtype=torch.float32) + x = self.linear(x) + return torch.add(x, artificial_bias, alpha=self.alpha) + + +class LinearAddModuleReverseNodeOrder(torch.nn.Module): + """The `ones` added by the `add` are only generated after the `linear` node.""" + + def __init__( + self, + fc_in_features: int, + fc_out_features: int, + bias: bool, + artificial_bias_shape: list[int], + ): + super().__init__() + self.fc_in_features = fc_in_features + self.fc_out_features = fc_out_features + self.bias = bias + self.artificial_bias_shape = artificial_bias_shape + self.linear = torch.nn.Linear(fc_in_features, fc_out_features, bias=bias) + self.eval() + + def forward(self, x): + # The `ones` are generated after the `linear` call. 
+ x = self.linear(x) + artificial_bias = torch.ones(self.artificial_bias_shape, dtype=torch.float32) + return torch.add(x, artificial_bias) + + +class LinearAddModuleReverseInputOrder(torch.nn.Module): + """The `add` has the output of the `linear` as its second input (which is the input multiplied by `alpha`).""" + + def __init__( + self, + fc_in_features: int, + fc_out_features: int, + bias: bool, + artificial_bias_shape: list[int], + alpha=1.0, + ): + super().__init__() + self.fc_in_features = fc_in_features + self.fc_out_features = fc_out_features + self.bias = bias + self.artificial_bias_shape = artificial_bias_shape + self.alpha = alpha + self.linear = torch.nn.Linear(fc_in_features, fc_out_features, bias=bias) + self.eval() + + def forward(self, x): + artificial_bias = torch.ones(self.artificial_bias_shape, dtype=torch.float32) + x = self.linear(x) + return torch.add(artificial_bias, x, alpha=self.alpha) # Reversed input order. + + +class TestLinearAndAddFusing(unittest.TestCase): + __test__ = False # Prevent interfering with PyTest tests. + + @classmethod + def setUpClass(cls): + torch.manual_seed(23) + np.random.seed(42) + + @parameterized.expand( + [ + ["2D", [4, 6]], + ["4D", [4, 6, 8, 10]], + ] + ) + def test_linear_add_fusing__static__no_bias__valid_shape( + self, _, input_shape: list[int] + ): + example_input = (torch.ones(input_shape),) + + module = LinearAddModule(input_shape[-1], 5, False, [5]) + program = torch.export.export(module, example_input, strict=True) + original_module = program.module() + + modified_module = NeutronAtenPassManager( + [ + RemoveNodesWithKnownOutputs(), # Make the added tensor static. + FuseLinearAndAddPass(), + ] + )(deepcopy(program.module())).graph_module + + # Make sure the module wasn't broken. 
+ original_nodes = list(original_module.graph.nodes) + modified_nodes = list(modified_module.graph.nodes) + + assert len(original_nodes) == 6 + assert original_nodes[3].target == torch.ops.aten.linear.default + assert original_nodes[4].target == torch.ops.aten.add.Tensor + + # The `add` has been removed. + assert len(modified_nodes) == 5 + assert modified_nodes[3].target == torch.ops.aten.linear.default + assert len(modified_nodes[3].args) == 3 + assert "ones" in modified_nodes[3].args[2].name + assert not graph_contains_any_of_ops( + modified_module.graph, [torch.ops.aten.add.Tensor] + ) + + # Verify that the behavior has not changed. + input_data = torch.randn(input_shape, dtype=torch.float32) + out1 = original_module(input_data).detach().numpy() + out2 = modified_module(input_data).detach().numpy() + assert np.allclose(out1, out2) + + @parameterized.expand( + [ + ["2D", [8, 10]], + ] + ) + def test_linear_add_fusing__static__no_bias__invalid_shape( + self, _, input_shape: list[int] + ): + example_input = (torch.ones(input_shape),) + + module = LinearAddModule( + input_shape[-1], 5, False, [8, 5] # Unsupported `linear` bias shape. + ) + program = torch.export.export(module, example_input, strict=True) + original_module = program.module() + + modified_module = NeutronAtenPassManager( + [ + RemoveNodesWithKnownOutputs(), # Make the added tensor static. + FuseLinearAndAddPass(), + ] + )(deepcopy(program.module())).graph_module + + # Make sure the module wasn't broken. + original_nodes = list(original_module.graph.nodes) + modified_nodes = list(modified_module.graph.nodes) + + assert len(original_nodes) == 6 + assert original_nodes[3].target == torch.ops.aten.linear.default + assert len(original_nodes[3].args) == 2 + assert original_nodes[4].target == torch.ops.aten.add.Tensor + + # Nothing changed. 
+ assert len(modified_nodes) == 6 + assert modified_nodes[3].target == torch.ops.aten.linear.default + assert modified_nodes[4].target == torch.ops.aten.add.Tensor + + # Verify that the behavior has not changed. + input_data = torch.randn(input_shape, dtype=torch.float32) + out1 = original_module(input_data).detach().numpy() + out2 = modified_module(input_data).detach().numpy() + assert np.allclose(out1, out2) + + @parameterized.expand( + [ + ["2D", [4, 6]], + ["4D", [2, 3, 4, 5]], + ] + ) + def test_linear_add_fusing__static__bias__valid_shape( + self, _, input_shape: list[int] + ): + example_input = (torch.ones(input_shape),) + + module = LinearAddModule(input_shape[-1], 5, True, [5]) + program = torch.export.export(module, example_input, strict=True) + original_module = program.module() + + modified_module = NeutronAtenPassManager( + [ + RemoveNodesWithKnownOutputs(), # Make the added tensor static. + FuseLinearAndAddPass(), + ] + )(deepcopy(program.module())).graph_module + + # Make sure the module wasn't broken. + original_nodes = list(original_module.graph.nodes) + modified_nodes = list(modified_module.graph.nodes) + + assert len(original_nodes) == 7 + assert original_nodes[3].target == torch.ops.aten.ones.default + assert original_nodes[4].target == torch.ops.aten.linear.default + assert len(original_nodes[4].args) == 3 + assert original_nodes[5].target == torch.ops.aten.add.Tensor + + # make sure the `add` and the `ones` were removed. + assert len(modified_nodes) == 5 + assert not graph_contains_any_of_ops( + modified_module.graph, [torch.ops.aten.ones.default] + ) + assert modified_nodes[3].target == torch.ops.aten.linear.default + assert len(modified_nodes[3].args) == 3 + assert "combined" in modified_nodes[3].args[2].name + assert not graph_contains_any_of_ops( + modified_module.graph, [torch.ops.aten.add.Tensor] + ) + + # Verify that the behavior has not changed. 
+ input_data = torch.randn(input_shape, dtype=torch.float32) + out1 = original_module(input_data).detach().numpy() + out2 = modified_module(input_data).detach().numpy() + assert np.allclose(out1, out2) + + def test_linear_add_fusing__static__no_bias__reverse_order(self): + input_shape = [4, 8] + example_input = (torch.ones(input_shape),) + + # Use a module where the `bias` is generated after the `linear` node, which prevents the change. + module = LinearAddModuleReverseNodeOrder(input_shape[-1], 5, False, [5]) + program = torch.export.export(module, example_input, strict=True) + original_module = program.module() + + modified_module = NeutronAtenPassManager( + [ + RemoveNodesWithKnownOutputs(), # Make the added tensor static. + FuseLinearAndAddPass(), + ] + )(deepcopy(program.module())).graph_module + + # Make sure the module wasn't broken. + original_nodes = list(original_module.graph.nodes) + modified_nodes = list(modified_module.graph.nodes) + + assert len(original_nodes) == 6 + assert original_nodes[2].target == torch.ops.aten.linear.default + assert len(original_nodes[2].args) == 2 + assert ( + original_nodes[3].target == torch.ops.aten.ones.default + ) # `ones` after `linear`. + assert original_nodes[4].target == torch.ops.aten.add.Tensor + + # The `add` has been removed. + assert len(modified_nodes) == 5 + assert modified_nodes[3].target == torch.ops.aten.linear.default + assert len(modified_nodes[3].args) == 3 + assert not graph_contains_any_of_ops( + modified_module.graph, [torch.ops.aten.add.Tensor] + ) + + # Verify that the behavior has not changed. 
+ input_data = torch.randn(input_shape, dtype=torch.float32) + out1 = original_module(input_data).detach().numpy() + out2 = modified_module(input_data).detach().numpy() + assert np.allclose(out1, out2) + + def test_linear_add_fusing__static__bias__reverse_order(self): + input_shape = [4, 8] + example_input = (torch.ones(input_shape),) + + # Use a module where the `bias` is generated after the `linear` node, which prevents the change. + module = LinearAddModuleReverseNodeOrder(input_shape[-1], 5, True, [5]) + program = torch.export.export(module, example_input, strict=True) + original_module = program.module() + + modified_module = NeutronAtenPassManager( + [ + RemoveNodesWithKnownOutputs(), # Make the added tensor static. + FuseLinearAndAddPass(), + ] + )(deepcopy(program.module())).graph_module + + # Make sure the module wasn't broken. + original_nodes = list(original_module.graph.nodes) + modified_nodes = list(modified_module.graph.nodes) + + assert len(original_nodes) == 7 + assert original_nodes[3].target == torch.ops.aten.linear.default + assert len(original_nodes[3].args) == 3 + assert ( + original_nodes[4].target == torch.ops.aten.ones.default + ) # `ones` after `linear`. + assert original_nodes[5].target == torch.ops.aten.add.Tensor + + # The `add` and `ones` have been removed. + assert len(modified_nodes) == 5 + assert not graph_contains_any_of_ops( + modified_module.graph, [torch.ops.aten.ones.default] + ) + assert modified_nodes[3].target == torch.ops.aten.linear.default + assert len(modified_nodes[3].args) == 3 + assert not graph_contains_any_of_ops( + modified_module.graph, [torch.ops.aten.add.Tensor] + ) + + # Verify that the behavior has not changed. 
+ input_data = torch.randn(input_shape, dtype=torch.float32) + out1 = original_module(input_data).detach().numpy() + out2 = modified_module(input_data).detach().numpy() + assert np.allclose(out1, out2) + + def test_linear_add_fusing__static__alpha__no_bias(self): + alpha = 2.34 + input_shape = [4, 8] + example_input = (torch.ones(input_shape),) + + module = LinearAddModule(input_shape[-1], 5, False, [5], alpha=alpha) + program = torch.export.export(module, example_input, strict=True) + original_module = program.module() + + modified_module = NeutronAtenPassManager( + [ + RemoveNodesWithKnownOutputs(), # Make the added tensor static. + FuseLinearAndAddPass(), + ] + )(deepcopy(program.module())).graph_module + + # Make sure the module wasn't broken. + original_nodes = list(original_module.graph.nodes) + modified_nodes = list(modified_module.graph.nodes) + + assert len(original_nodes) == 6 + assert original_nodes[2].target == torch.ops.aten.ones.default + assert original_nodes[3].target == torch.ops.aten.linear.default + assert len(original_nodes[3].args) == 2 + assert original_nodes[4].target == torch.ops.aten.add.Tensor + assert original_nodes[4].kwargs["alpha"] == alpha + + # The `add` has been removed. + assert len(modified_nodes) == 5 + assert modified_nodes[3].target == torch.ops.aten.linear.default + assert len(modified_nodes[3].args) == 3 + assert not graph_contains_any_of_ops( + modified_module.graph, [torch.ops.aten.add.Tensor] + ) + + # Verify that the behavior has not changed. 
+ input_data = torch.randn(input_shape, dtype=torch.float32) + out1 = original_module(input_data).detach().numpy() + out2 = modified_module(input_data).detach().numpy() + assert np.allclose(out1, out2) + + def test_linear_add_fusing__static__alpha__bias(self): + alpha = 2.34 + input_shape = [4, 8] + example_input = (torch.ones(input_shape),) + + module = LinearAddModule(input_shape[-1], 5, True, [5], alpha=alpha) + program = torch.export.export(module, example_input, strict=True) + original_module = program.module() + + modified_module = NeutronAtenPassManager( + [ + RemoveNodesWithKnownOutputs(), # Make the added tensor static. + FuseLinearAndAddPass(), + ] + )(deepcopy(program.module())).graph_module + + # Make sure the module wasn't broken. + original_nodes = list(original_module.graph.nodes) + modified_nodes = list(modified_module.graph.nodes) + + assert len(original_nodes) == 7 + assert original_nodes[3].target == torch.ops.aten.ones.default + assert original_nodes[4].target == torch.ops.aten.linear.default + assert len(original_nodes[4].args) == 3 + assert original_nodes[5].target == torch.ops.aten.add.Tensor + assert original_nodes[5].kwargs["alpha"] == alpha + + # The `add` has been removed. + assert len(modified_nodes) == 5 + assert modified_nodes[3].target == torch.ops.aten.linear.default + assert len(modified_nodes[3].args) == 3 + assert not graph_contains_any_of_ops( + modified_module.graph, [torch.ops.aten.add.Tensor] + ) + + # Verify that the behavior has not changed. 
+ input_data = torch.randn(input_shape, dtype=torch.float32) + out1 = original_module(input_data).detach().numpy() + out2 = modified_module(input_data).detach().numpy() + assert np.allclose(out1, out2) + + def test_linear_add_fusing__static__alpha__reversed_add_inputs(self): + alpha = 2.34 + input_shape = [4, 8] + example_input = (torch.ones(input_shape),) + + module = LinearAddModuleReverseInputOrder( + input_shape[-1], 5, True, [5], alpha=alpha + ) + program = torch.export.export(module, example_input, strict=True) + original_module = program.module() + + modified_module = NeutronAtenPassManager( + [ + RemoveNodesWithKnownOutputs(), # Make the added tensor static. + FuseLinearAndAddPass(), + ] + )(deepcopy(program.module())).graph_module + + # Make sure the module wasn't broken. + original_nodes = list(original_module.graph.nodes) + modified_nodes = list(modified_module.graph.nodes) + + assert len(original_nodes) == 7 + assert original_nodes[3].target == torch.ops.aten.ones.default + assert original_nodes[4].target == torch.ops.aten.linear.default + assert len(original_nodes[4].args) == 3 + assert original_nodes[5].target == torch.ops.aten.add.Tensor + assert ( + original_nodes[5].args[1] == original_nodes[4] + ) # `linear` is the second input. + assert original_nodes[5].kwargs["alpha"] == alpha + + # Nothing changed (except the `ones` was replaced by static data). + assert len(modified_nodes) == 7 + assert modified_nodes[4].target == torch.ops.aten.linear.default + assert len(modified_nodes[4].args) == 3 + assert modified_nodes[5].target == torch.ops.aten.add.Tensor + assert ( + modified_nodes[5].args[1] == modified_nodes[4] + ) # `linear` is the second input. + assert modified_nodes[5].kwargs["alpha"] == alpha + + # Verify that the behavior has not changed. 
+ input_data = torch.randn(input_shape, dtype=torch.float32) + out1 = original_module(input_data).detach().numpy() + out2 = modified_module(input_data).detach().numpy() + assert np.allclose(out1, out2) + + @parameterized.expand( + [ + ["2D", [4, 6]], + ] + ) + def test_linear_add_fusing__dynamic__no_bias__valid_shape( + self, _, input_shape: list[int] + ): + example_input = (torch.ones(input_shape),) + + module = LinearAddModule(input_shape[-1], 5, False, [5]) + program = torch.export.export(module, example_input, strict=True) + original_module = program.module() + + modified_module = NeutronAtenPassManager([FuseLinearAndAddPass()])( + deepcopy(program.module()) + ).graph_module + + # Make sure the module wasn't broken. + original_nodes = list(original_module.graph.nodes) + modified_nodes = list(modified_module.graph.nodes) + + assert len(original_nodes) == 6 + assert original_nodes[3].target == torch.ops.aten.linear.default + assert original_nodes[4].target == torch.ops.aten.add.Tensor + + # Nothing changed. + assert len(modified_nodes) == 6 + assert modified_nodes[3].target == torch.ops.aten.linear.default + assert modified_nodes[4].target == torch.ops.aten.add.Tensor + + # Verify that the behavior has not changed. + input_data = torch.randn(input_shape, dtype=torch.float32) + out1 = original_module(input_data).detach().numpy() + out2 = modified_module(input_data).detach().numpy() + assert np.allclose(out1, out2) + + @parameterized.expand( + [ + ["2D", [8, 10]], + ] + ) + def test_linear_add_fusing__dynamic__no_bias__invalid_shape( + self, _, input_shape: list[int] + ): + example_input = (torch.ones(input_shape),) + + module = LinearAddModule( + input_shape[-1], 5, False, [8, 5] # Unsupported `linear` bias shape. 
+ ) + program = torch.export.export(module, example_input, strict=True) + original_module = program.module() + + modified_module = NeutronAtenPassManager([FuseLinearAndAddPass()])( + deepcopy(program.module()) + ).graph_module + + # Make sure the module wasn't broken. + original_nodes = list(original_module.graph.nodes) + modified_nodes = list(modified_module.graph.nodes) + + assert len(original_nodes) == 6 + assert original_nodes[3].target == torch.ops.aten.linear.default + assert original_nodes[4].target == torch.ops.aten.add.Tensor + + # Nothing changed. + assert len(modified_nodes) == 6 + assert modified_nodes[3].target == torch.ops.aten.linear.default + assert modified_nodes[4].target == torch.ops.aten.add.Tensor + + # Verify that the behavior has not changed. + input_data = torch.randn(input_shape, dtype=torch.float32) + out1 = original_module(input_data).detach().numpy() + out2 = modified_module(input_data).detach().numpy() + assert np.allclose(out1, out2) + + @parameterized.expand( + [ + ["2D", [4, 6]], + ] + ) + def test_linear_add_fusing__dynamic__bias__valid_shape( + self, _, input_shape: list[int] + ): + example_input = (torch.ones(input_shape),) + + module = LinearAddModule(input_shape[-1], 5, True, [5]) + program = torch.export.export(module, example_input, strict=True) + original_module = program.module() + + modified_module = NeutronAtenPassManager([FuseLinearAndAddPass()])( + deepcopy(program.module()) + ).graph_module + + # Make sure the module wasn't broken. + original_nodes = list(original_module.graph.nodes) + modified_nodes = list(modified_module.graph.nodes) + + assert len(original_nodes) == 7 + assert original_nodes[3].target == torch.ops.aten.ones.default + assert original_nodes[4].target == torch.ops.aten.linear.default + assert original_nodes[5].target == torch.ops.aten.add.Tensor + + # Nothing has changed, as the second bias is dynamic, so it cannot be added together with the first bias. 
+ assert len(modified_nodes) == 7 + assert modified_nodes[3].target == torch.ops.aten.ones.default + assert modified_nodes[4].target == torch.ops.aten.linear.default + assert modified_nodes[5].target == torch.ops.aten.add.Tensor + + # Verify that the behavior has not changed. + input_data = torch.randn(input_shape, dtype=torch.float32) + out1 = original_module(input_data).detach().numpy() + out2 = modified_module(input_data).detach().numpy() + assert np.allclose(out1, out2) + + def test_linear_add_fusing__dynamic__reverse_order(self): + input_shape = [4, 8] + example_input = (torch.ones(input_shape),) + + # Use a module where the `bias` is generated after the `linear` node, which prevents the change. + module = LinearAddModuleReverseNodeOrder(input_shape[-1], 5, False, [5]) + program = torch.export.export(module, example_input, strict=True) + original_module = program.module() + + modified_module = NeutronAtenPassManager([FuseLinearAndAddPass()])( + deepcopy(program.module()) + ).graph_module + + # Make sure the module wasn't broken. + original_nodes = list(original_module.graph.nodes) + modified_nodes = list(modified_module.graph.nodes) + + assert len(original_nodes) == 6 + assert original_nodes[2].target == torch.ops.aten.linear.default + assert original_nodes[3].target == torch.ops.aten.ones.default + assert original_nodes[4].target == torch.ops.aten.add.Tensor + + # Nothing has changed. + assert len(modified_nodes) == 6 + assert modified_nodes[2].target == torch.ops.aten.linear.default + assert modified_nodes[3].target == torch.ops.aten.ones.default + assert modified_nodes[4].target == torch.ops.aten.add.Tensor + + # Verify that the behavior has not changed. 
+ input_data = torch.randn(input_shape, dtype=torch.float32) + out1 = original_module(input_data).detach().numpy() + out2 = modified_module(input_data).detach().numpy() + assert np.allclose(out1, out2) + + def test_linear_add_fusing__dynamic__alpha(self): + alpha = 2.34 + input_shape = [4, 8] + example_input = (torch.ones(input_shape),) + + module = LinearAddModule(input_shape[-1], 5, False, [5], alpha=alpha) + program = torch.export.export(module, example_input, strict=True) + original_module = program.module() + + modified_module = NeutronAtenPassManager([FuseLinearAndAddPass()])( + deepcopy(program.module()) + ).graph_module + + # Make sure the module wasn't broken. + original_nodes = list(original_module.graph.nodes) + modified_nodes = list(modified_module.graph.nodes) + + assert len(original_nodes) == 6 + assert original_nodes[2].target == torch.ops.aten.ones.default + assert original_nodes[3].target == torch.ops.aten.linear.default + assert original_nodes[4].target == torch.ops.aten.add.Tensor + + # Nothing has changed. + assert len(modified_nodes) == 6 + assert modified_nodes[2].target == torch.ops.aten.ones.default + assert modified_nodes[3].target == torch.ops.aten.linear.default + assert modified_nodes[4].target == torch.ops.aten.add.Tensor + + # Verify that the behavior has not changed. + input_data = torch.randn(input_shape, dtype=torch.float32) + out1 = original_module(input_data).detach().numpy() + out2 = modified_module(input_data).detach().numpy() + assert np.allclose(out1, out2) diff --git a/backends/nxp/tests/test_neutron_backend.py b/backends/nxp/tests/test_neutron_backend.py index 53e54ec2f56..c9917651fbd 100644 --- a/backends/nxp/tests/test_neutron_backend.py +++ b/backends/nxp/tests/test_neutron_backend.py @@ -1,4 +1,4 @@ -# Copyright 2024 NXP +# Copyright 2024-2025 NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
diff --git a/backends/nxp/tests/test_neutron_converter_manager.py b/backends/nxp/tests/test_neutron_converter_manager.py index af723ec9c7a..2fcfd8cd987 100644 --- a/backends/nxp/tests/test_neutron_converter_manager.py +++ b/backends/nxp/tests/test_neutron_converter_manager.py @@ -1,4 +1,4 @@ -# Copyright 2024 NXP +# Copyright 2024-2025 NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -29,9 +29,7 @@ def test_conv2d_neutron_conversion__default_flavor(): ) neutron_converter_manager = NeutronConverterManager() - neutron_model = neutron_converter_manager.convert( - tflite_model, "imxrt700", "SDK_25_06" - ) + neutron_model = neutron_converter_manager.convert(tflite_model, "imxrt700") assert len( neutron_model @@ -50,9 +48,8 @@ def test__conv2d_neutron_conversion__invalid_flavor(): edge_program_manager.exported_program() ) - neutron_converter_manager = NeutronConverterManager() with pytest.raises(RuntimeError) as excinfo: - _ = neutron_converter_manager.convert(tflite_model, "imxrt700", "bad_flavor") + _ = NeutronConverterManager("bad_flavor").convert(tflite_model, "imxrt700") assert "Neutron Converter module with flavor 'bad_flavor' not found." in str( excinfo diff --git a/backends/nxp/tests/test_per_channel_conversion.py b/backends/nxp/tests/test_per_channel_conversion.py new file mode 100644 index 00000000000..043ba8fc001 --- /dev/null +++ b/backends/nxp/tests/test_per_channel_conversion.py @@ -0,0 +1,153 @@ +# Copyright 2025 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest + +import kgb +import numpy as np +import torch + +from executorch.backends.nxp.backend.edge_program_converter import ( + EdgeProgramToIRConverter, +) +from executorch.backends.nxp.quantizer.neutron_quantizer import ( + act_qspec, + NeutronAtenQuantizer, + wgt_qspec, +) +from executorch.backends.nxp.quantizer.patterns import ( + NodeArgsIdx, + PartitionAnchors, + QuantizationPattern, +) +from executorch.backends.nxp.quantizer.utils import get_bias_qparams +from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program +from executorch.backends.nxp.tests.executors import ( + convert_run_compare, + ToChannelFirstPreprocess, + ToChannelLastPreprocess, +) +from executorch.backends.nxp.tests.models import Conv2dModule +from executorch.backends.nxp.tests.test_quantizer import _get_target_name + +from torch import fx +from torch._ops import OpOverload +from torch.export import ExportedProgram +from torchao.quantization.pt2e import MinMaxObserver, PerChannelMinMaxObserver +from torchao.quantization.pt2e.quantizer import ( + DerivedQuantizationSpec, + QuantizationConfig, + QuantizationSpec, +) + + +class Conv2dPatternPerChannel(QuantizationPattern): + + def __init__(self, is_per_channel: bool): + super().__init__() + self.is_per_channel = is_per_channel + + def partition_types(self) -> list[OpOverload]: + return [torch.ops.aten.conv2d.default] + + def get_anchors( + self, gm: fx.GraphModule, fused_partition: list[fx.GraphModule] + ) -> PartitionAnchors: + conv2d_node = fused_partition[0].nodes[-1] + + bias_qscheme = ( + torch.per_channel_symmetric + if self.is_per_channel + else torch.per_tensor_symmetric + ) + bias_quantization_qspec = DerivedQuantizationSpec( + derived_from=[ + (conv2d_node.args[0], conv2d_node), + (conv2d_node.args[1], conv2d_node), + ], + derive_qparams_fn=get_bias_qparams, + dtype=torch.int32, + quant_min=-(2**31) + 1, + quant_max=2**31 - 1, + qscheme=bias_qscheme, + ch_axis=0, + ) + + weight_qscheme = ( + 
torch.per_channel_symmetric + if self.is_per_channel + else torch.per_tensor_symmetric + ) + weight_observer_or_fake_quant_ctr = ( + PerChannelMinMaxObserver if self.is_per_channel else MinMaxObserver + ) + weight_quantization_spec = QuantizationSpec( + dtype=torch.int8, + observer_or_fake_quant_ctr=weight_observer_or_fake_quant_ctr, + quant_min=-127, + quant_max=127, + qscheme=weight_qscheme, + ch_axis=0, + ) + + return PartitionAnchors( + inputs=[(conv2d_node, NodeArgsIdx(0))], + weights=[(conv2d_node, NodeArgsIdx(1), weight_quantization_spec)], + biases=[(conv2d_node, NodeArgsIdx(2), bias_quantization_qspec)], + output=[(conv2d_node,)], + ) + + +class TestPerChannelConversion(unittest.TestCase): + __test__ = False # Prevent interfering with PyTest tests + + @classmethod + def setUpClass(cls): + torch.manual_seed(25) + np.random.seed(25) + + def test_per_channel_convolution(self): + with kgb.spy_on( + EdgeProgramToIRConverter.convert_program, call_original=True + ) as converter_spy: + model = Conv2dModule( + in_channels=8, out_channels=32, kernel_size=5, padding=3 + ) + input_shape = (1, 8, 32, 32) + + static_qconfig = QuantizationConfig(act_qspec, act_qspec, wgt_qspec, None) + _ = to_quantized_edge_program( + model, + input_shape, + get_quantizer_fn=lambda: NeutronAtenQuantizer( + Conv2dPatternPerChannel(is_per_channel=True), static_qconfig + ), + ) + + tflite_flatbuffers_model, io_formats = converter_spy.calls[-1].return_value + exported_program: ExportedProgram = converter_spy.calls[-1].args[0] + + input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype( + np.int8 + ) + + convert_run_compare( + exported_program, + tflite_input_preprocess=ToChannelLastPreprocess(), + tfl_model=tflite_flatbuffers_model, + tflite_output_preprocess=ToChannelFirstPreprocess(), + input_data=input_data, + atol=1.0, + ) + + nodes = list(exported_program.graph.nodes) + + assert _get_target_name(nodes[8]).endswith( + 
"quantized_decomposed.dequantize_per_channel.default" + ) + assert _get_target_name(nodes[9]).endswith( + "quantized_decomposed.dequantize_per_channel.default" + ) + assert nodes[10].name == "aten_convolution_default" diff --git a/backends/nxp/tests/test_qdq_clustering_conv.py b/backends/nxp/tests/test_qdq_clustering_conv.py index 1713aace1fe..ffae931dbb4 100644 --- a/backends/nxp/tests/test_qdq_clustering_conv.py +++ b/backends/nxp/tests/test_qdq_clustering_conv.py @@ -16,13 +16,13 @@ def test_conv2d_partitioner(): lowered_module = edge_program.exported_program().graph_module.lowered_module_0 nodes = list(lowered_module.original_module.graph.nodes) - assert len(nodes) == 7 + assert len(nodes) == 9 - q_x_node = nodes[1] - dq_w_node = nodes[2] - dq_x_node = nodes[3] - conv_node = nodes[4] - q_y_node = nodes[5] + q_x_node = nodes[3] + dq_w_node = nodes[4] + dq_x_node = nodes[5] + conv_node = nodes[6] + q_y_node = nodes[7] assert "cluster" not in q_x_node.meta assert dq_w_node.meta["cluster"] == "aten_convolution_default_cluster" diff --git a/backends/nxp/tests/test_quantizer.py b/backends/nxp/tests/test_quantizer.py index ef5fbb0cbca..624e350ed21 100644 --- a/backends/nxp/tests/test_quantizer.py +++ b/backends/nxp/tests/test_quantizer.py @@ -1,4 +1,4 @@ -# Copyright 2024 NXP +# Copyright 2024-2025 NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
@@ -34,26 +34,26 @@ def test_quantizer_conv2d(): m(*example_input) nodes = list(m.graph.nodes) - assert len(nodes) == 11 - assert nodes[7].name == "conv2d" + assert len(nodes) == 15 + assert nodes[11].name == "conv2d" # [0]: Input, [1] : weights, [2]: bias assert ( - _get_target_name(nodes[7].args[0]) + _get_target_name(nodes[11].args[0]) == "torch.ops.quantized_decomposed.dequantize_per_tensor.default" ) assert ( - _get_target_name(nodes[7].args[1]) - == "torch.ops.quantized_decomposed.dequantize_per_tensor.default" + _get_target_name(nodes[11].args[1]) + == "torch.ops.quantized_decomposed.dequantize_per_channel.default" ) assert ( - _get_target_name(nodes[7].args[2]) - == "torch.ops.quantized_decomposed.dequantize_per_tensor.default" + _get_target_name(nodes[11].args[2]) + == "torch.ops.quantized_decomposed.dequantize_per_channel.default" ) assert ( - _get_target_name(nodes[8]) + _get_target_name(nodes[12]) == "torch.ops.quantized_decomposed.quantize_per_tensor.default" ) - assert nodes[8].args[0].name == "conv2d" + assert nodes[12].args[0].name == "conv2d" def test_quantizer_linear(): @@ -112,22 +112,22 @@ def test_quantizer_maxpool2d(): m(*example_input) nodes = list(m.graph.nodes) - assert len(nodes) == 14 + assert len(nodes) == 18 # Check if QDQ pattern: - assert nodes[10].name == "max_pool2d" + assert nodes[14].name == "max_pool2d" assert ( - _get_target_name(nodes[10].args[0]) + _get_target_name(nodes[14].args[0]) == "torch.ops.quantized_decomposed.dequantize_per_tensor.default" ) assert ( - _get_target_name(nodes[11]) + _get_target_name(nodes[15]) == "torch.ops.quantized_decomposed.quantize_per_tensor.default" ) - assert nodes[11].args[0].name == "max_pool2d" + assert nodes[15].args[0].name == "max_pool2d" # Check if input and output quantization is same - input_quant = nodes[10].args[0].args[1:] - output_quant = nodes[11].args[1:] + input_quant = nodes[14].args[0].args[1:] + output_quant = nodes[15].args[1:] assert input_quant == output_quant @@ -207,10 
+207,10 @@ def test_quantizer_conv2d_relu(): m(*example_input) nodes = list(m.graph.nodes) - assert len(nodes) == 12 - assert nodes[7].name == "dequantize_per_tensor_default_2" - assert nodes[8].name == "relu" - assert nodes[9].name == "quantize_per_tensor_default_3" + assert len(nodes) == 14 + assert nodes[9].name == "dequantize_per_tensor_default_1" + assert nodes[10].name == "relu" + assert nodes[11].name == "quantize_per_tensor_default_2" def test_quantizer_conv2d_avg_pool2d(): @@ -230,10 +230,10 @@ def test_quantizer_conv2d_avg_pool2d(): m(*example_input) nodes = list(m.graph.nodes) - assert len(nodes) == 14 - assert nodes[9].name == "dequantize_per_tensor_default_3" - assert nodes[10].name == "avg_pool2d" - assert nodes[11].name == "quantize_per_tensor_default_4" + assert len(nodes) == 18 + assert nodes[13].name == "dequantize_per_tensor_default_1" + assert nodes[14].name == "avg_pool2d" + assert nodes[15].name == "quantize_per_tensor_default_2" def test_quantizer_conv2d_permute(): @@ -253,10 +253,11 @@ def test_quantizer_conv2d_permute(): m(*example_input) nodes = list(m.graph.nodes) - assert len(nodes) == 12 - assert nodes[7].name == "dequantize_per_tensor_default_2" - assert nodes[8].name == "permute" - assert nodes[9].name == "quantize_per_tensor_default_3" + + assert len(nodes) == 14 + assert nodes[9].name == "dequantize_per_tensor_default_1" + assert nodes[10].name == "permute" + assert nodes[11].name == "quantize_per_tensor_default_2" def test_multiple_shared_spec_ops_in_row(): @@ -281,15 +282,15 @@ def test_multiple_shared_spec_ops_in_row(): nodes = list(m.graph.nodes) - assert len(nodes) == 15 - assert nodes[-5].name == "dequantize_per_tensor_default_3" + assert len(nodes) == 17 + assert nodes[-5].name.startswith("dequantize_per_tensor_default") assert nodes[-4].name == "max_pool2d" - assert nodes[-3].name == "quantize_per_tensor_default_4" + assert nodes[-3].name.startswith("quantize_per_tensor_default") # Assert that post-ReLU quantize and 
pre-MaxPool dequantize has same specs assert nodes[-6].args[1:] == nodes[-5].args[1:] # Assert that post-Conv quantize and pre-ReLU dequantize has same specs - assert nodes[6].args[1:] == nodes[7].args[1:] + assert nodes[5].args[1:] == nodes[6].args[1:] def test_quantizers_order_invariance(): diff --git a/backends/nxp/tests/test_removing_dead_code.py b/backends/nxp/tests/test_removing_dead_code.py index 7b8641fb247..cc51746c81c 100644 --- a/backends/nxp/tests/test_removing_dead_code.py +++ b/backends/nxp/tests/test_removing_dead_code.py @@ -9,6 +9,7 @@ import pytest import torch +from executorch.backends.nxp.quantizer.neutron_quantizer import NeutronQuantizer from executorch.backends.nxp.tests.executorch_pipeline import _quantize_model from executorch.backends.nxp.tests.executors import graph_contains_any_of_ops @@ -32,6 +33,11 @@ def forward(self, x): class TestRemovingDeadCode(unittest.TestCase): __test__ = False # Prevent interfering with PyTest tests + @classmethod + def setUpClass(cls): + torch.manual_seed(23) + np.random.seed(23) + def test_removing_dead_code(self): input_shape = (42,) example_inputs = (torch.ones(input_shape),) @@ -45,16 +51,12 @@ def test_removing_dead_code(self): ) # The `NeutronQuantizer` should remove the dead code in the `transform_for_annotation()` method. + quantizer = NeutronQuantizer() exir_program_aten_quant = _quantize_model( - exir_program_aten.module(), [example_inputs] + exir_program_aten.module(), quantizer, [example_inputs] ) # Make sure the is no `add` operation in the graph anymore. 
assert not any( "add" in str(node.target) for node in exir_program_aten_quant.graph.nodes ) - - @classmethod - def setUpClass(cls): - torch.manual_seed(23) - np.random.seed(23) diff --git a/backends/nxp/tests/test_split_group_convolution.py b/backends/nxp/tests/test_split_group_convolution.py index 1da53af794d..4c9f277e34d 100644 --- a/backends/nxp/tests/test_split_group_convolution.py +++ b/backends/nxp/tests/test_split_group_convolution.py @@ -17,6 +17,7 @@ ) from executorch.backends.nxp.neutron_partitioner import NeutronPartitioner from executorch.backends.nxp.nxp_backend import generate_neutron_compile_spec +from executorch.backends.nxp.quantizer.neutron_quantizer import NeutronQuantizer from executorch.backends.nxp.tests.executorch_pipeline import ( _quantize_model, get_random_calibration_inputs, @@ -39,8 +40,11 @@ def _quantize_and_lower_module( module: GraphModule, input_shape: tuple[int, ...], target="imxrt700" ) -> EdgeProgramManager: calibration_inputs = get_random_calibration_inputs(to_model_input_spec(input_shape)) + quantizer = NeutronQuantizer() - exir_program_aten__module_quant = _quantize_model(module, calibration_inputs) + exir_program_aten__module_quant = _quantize_model( + module, quantizer, calibration_inputs + ) edge_compile_config = EdgeCompileConfig(_check_ir_validity=False) edge_program_manager = export_to_edge( @@ -49,7 +53,7 @@ def _quantize_and_lower_module( edge_compile_config=edge_compile_config, ) - compile_spec = generate_neutron_compile_spec(target, "SDK_25_06") + compile_spec = generate_neutron_compile_spec(target, "SDK_25_09") partitioner = NeutronPartitioner(compile_spec) return edge_program_manager.to_backend(partitioner) @@ -106,7 +110,7 @@ def test_split_group_convolution__2d(self, _, input_shape: list[int], group: int input_data = torch.randn(input_shape, dtype=torch.float32) out1 = original_module(input_data).detach().numpy() out2 = modified_module(input_data).detach().numpy() - assert np.allclose(out1, out2, atol=2.0e-7) + 
assert np.allclose(out1, out2, atol=2.0e-7, rtol=1.9e-4) # Make sure the graph can be correctly quantized and lowered to edge. ep = _quantize_and_lower_module( diff --git a/backends/openvino/CMakeLists.txt b/backends/openvino/CMakeLists.txt index 4d32d8932c2..736ed6d8603 100644 --- a/backends/openvino/CMakeLists.txt +++ b/backends/openvino/CMakeLists.txt @@ -53,35 +53,11 @@ target_sources( executorch_target_link_options_shared_lib(openvino_backend) -if(EXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER) - # Build executor runner binary for openvino backend - list(APPEND openvino_executor_runner_libs openvino_backend executorch) - - set(_openvino_executor_runner__srcs - ${EXECUTORCH_ROOT}/examples/portable/executor_runner/executor_runner.cpp - ${EXECUTORCH_ROOT}/extension/data_loader/file_data_loader.cpp - ${EXECUTORCH_ROOT}/extension/evalue_util/print_evalue.cpp - ${EXECUTORCH_ROOT}/extension/runner_util/inputs.cpp - ${EXECUTORCH_ROOT}/extension/runner_util/inputs_portable.cpp - ) - add_executable(openvino_executor_runner ${_openvino_executor_runner__srcs}) - - list(APPEND openvino_executor_runner_libs) - - target_link_libraries( - openvino_executor_runner gflags portable_ops_lib - ${openvino_executor_runner_libs} - ) - target_compile_options( - openvino_executor_runner PUBLIC ${_common_compile_options} - ) -endif() - # Install OpenVINO backend library to the lib directory install( TARGETS openvino_backend EXPORT ExecuTorchTargets - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} ) diff --git a/backends/openvino/README.md b/backends/openvino/README.md index 0046ad23486..5ce38ade56f 100644 --- a/backends/openvino/README.md +++ b/backends/openvino/README.md @@ -105,7 +105,7 @@ Follow the steps below to setup your build environment: ```bash ./openvino_build.sh --enable_python ``` - **Build C++ Runtime Libraries for OpenVINO Backend**: Run the `openvino_build.sh` script with the `--cpp_runtime` flag to build the C++ runtime 
libraries as shown in the below command. The compiled libraries files and binaries can be found in the `/cmake-out` directory. The binary located at `/cmake-out/backends/openvino/openvino_executor_runner` can be used to run inference with vision models. + **Build C++ Runtime Libraries for OpenVINO Backend**: Run the `openvino_build.sh` script with the `--cpp_runtime` flag to build the C++ runtime libraries as shown in the below command. The compiled libraries files and binaries can be found in the `/cmake-out` directory. The binary located at `/cmake-out/executor_runner` can be used to run inference with vision models. ```bash ./openvino_build.sh --cpp_runtime ``` diff --git a/backends/openvino/partitioner.py b/backends/openvino/partitioner.py index 00107959412..0d407e33f6e 100644 --- a/backends/openvino/partitioner.py +++ b/backends/openvino/partitioner.py @@ -36,6 +36,7 @@ def __init__(self): class OpenvinoOperatorsSupport(OperatorSupportBase): extended_support_dict = { "torch.ops.dim_order_ops._clone_dim_order.default": None, + "torch.ops.dim_order_ops._to_dim_order_copy.default": None, } def __init__( diff --git a/backends/openvino/preprocess.py b/backends/openvino/preprocess.py index 72c781c0fb3..691115f6579 100644 --- a/backends/openvino/preprocess.py +++ b/backends/openvino/preprocess.py @@ -8,13 +8,14 @@ from typing import final, List -from executorch.backends.transforms.remove_clone_ops import RemoveCloneOpsTransform from executorch.exir.backend.backend_details import ( BackendDetails, ExportedProgram, PreprocessResult, ) from executorch.exir.backend.compile_spec_schema import CompileSpec + +from executorch.exir.passes.memory_format_ops_pass import DimOrderOpsRevertPass from openvino.frontend.pytorch.torchdynamo.compile import ( # type: ignore[import-untyped] openvino_compile, ) @@ -37,8 +38,7 @@ def preprocess( Returns: PreprocessResult: The result of preprocessing, including the compiled model bytes. 
""" - # Apply RemoveCloneOpsTransform to eliminate unnecessary clone operations - transformed_ep = RemoveCloneOpsTransform()(edge_program.graph_module) + transformed_ep = DimOrderOpsRevertPass()(edge_program.graph_module) # Update the edge_program with the transformed graph if transformed_ep and transformed_ep.graph_module: diff --git a/backends/openvino/scripts/openvino_build.sh b/backends/openvino/scripts/openvino_build.sh index b7e5f5270ab..6d7853b96e5 100755 --- a/backends/openvino/scripts/openvino_build.sh +++ b/backends/openvino/scripts/openvino_build.sh @@ -30,10 +30,11 @@ build_cpp_runtime() { -DEXECUTORCH_BUILD_OPENVINO=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_BUILD_OPENVINO_EXECUTOR_RUNNER=ON \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \ -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \ diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt index 32105597260..07166b92ea2 100644 --- a/backends/qualcomm/CMakeLists.txt +++ b/backends/qualcomm/CMakeLists.txt @@ -214,7 +214,7 @@ add_subdirectory( install( TARGETS qnn_executorch_backend EXPORT ExecuTorchTargets - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} ) # QNN pybind diff --git a/backends/qualcomm/__init__.py b/backends/qualcomm/__init__.py index 04ba5fcf24b..5770dfb0fcd 100644 --- a/backends/qualcomm/__init__.py +++ b/backends/qualcomm/__init__.py @@ -1,23 +1,13 @@ import os -from .scripts.download_qnn_sdk import ( - check_glibc_exist_and_validate, - install_qnn_sdk, - is_linux_x86, -) +from .scripts.download_qnn_sdk import install_qnn_sdk, is_linux_x86 env_flag = os.getenv("EXECUTORCH_BUILDING_WHEEL", "0").lower() # If users have preinstalled 
QNN_SDK_ROOT, we will use it. qnn_sdk_root_flag = os.getenv("QNN_SDK_ROOT", None) -if ( - env_flag not in ("1", "true", "yes") - and not qnn_sdk_root_flag - and is_linux_x86() - and check_glibc_exist_and_validate() -): +if env_flag not in ("1", "true", "yes") and not qnn_sdk_root_flag and is_linux_x86(): ok = install_qnn_sdk() - if not ok: raise RuntimeError("Failed to install QNN SDK. Please check the logs above.") diff --git a/backends/qualcomm/_passes/TARGETS b/backends/qualcomm/_passes/TARGETS index 62a0fc43a78..876b51d3863 100644 --- a/backends/qualcomm/_passes/TARGETS +++ b/backends/qualcomm/_passes/TARGETS @@ -15,5 +15,6 @@ runtime.python_library( "//executorch/backends/transforms:decompose_sdpa", "//executorch/exir/backend:backend_details", "//executorch/exir/backend:compile_spec_schema", + "//executorch/backends/qualcomm/quantizer:quantizer", ], ) diff --git a/backends/qualcomm/_passes/__init__.py b/backends/qualcomm/_passes/__init__.py index 15fce79ea12..154a360689e 100644 --- a/backends/qualcomm/_passes/__init__.py +++ b/backends/qualcomm/_passes/__init__.py @@ -13,14 +13,18 @@ from .convert_linear_to_conv2d import ConvertLinearToConv2d from .convert_square_to_pow import ConvertSquareToPow from .decompose_any import DecomposeAny +from .decompose_binary_alpha import DecomposeBinaryAlpha from .decompose_cdist import DecomposeCDist from .decompose_col_im import DecomposeColIm from .decompose_einsum import DecomposeEinsum from .decompose_expm1 import DecomposeExpM1 +from .decompose_floor_divide import DecomposeFloorDivide +from .decompose_glu import DecomposeGlu from .decompose_linalg_vector_norm import DecomposeLinalgVectorNorm from .decompose_minmaxdim import DecomposeMinMaxDim from .decompose_roll import DecomposeRoll from .decompose_silu import DecomposeSilu +from .decompose_threshold import DecomposeThreshold from .decompose_wrap_with_autocast import DecomposeWrapWithAutocast from .expand_broadcast_tensor_shape import ExpandBroadcastTensorShape from 
.fixed_linear_keep_dim import FixedLinearKeepDim @@ -30,6 +34,7 @@ from .i64_to_i32 import I64toI32 from .insert_io_qdq import InsertIOQDQ from .insert_requantize import InsertRequantize +from .insert_reshape_for_reduce_ops import InsertReshapeForReduceOps from .layout_transform import LayoutTransform from .lift_constant_scalar_operands import LiftConstantScalarOperands from .recompose_pixel_unshuffle import RecomposePixelUnshuffle @@ -42,7 +47,6 @@ from .seq_mse import SeqMSE from .tag_quant_io import TagQuantIO - __all__ = [ AnnotateAdaptiveAvgPool1D, AnnotateQuantAttrs, @@ -53,14 +57,18 @@ ConvertLinearToConv2d, ConvertSquareToPow, DecomposeAny, + DecomposeBinaryAlpha, DecomposeCDist, DecomposeColIm, DecomposeEinsum, DecomposeExpM1, + DecomposeFloorDivide, + DecomposeGlu, DecomposeLinalgVectorNorm, DecomposeMinMaxDim, DecomposeRoll, DecomposeSilu, + DecomposeThreshold, DecomposeWrapWithAutocast, ExpandBroadcastTensorShape, FixedLinearKeepDim, @@ -69,6 +77,7 @@ FuseConsecutiveTranspose, I64toI32, InsertIOQDQ, + InsertReshapeForReduceOps, InsertRequantize, LayoutTransform, LiftConstantScalarOperands, diff --git a/backends/qualcomm/_passes/annotate_quant_attrs.py b/backends/qualcomm/_passes/annotate_quant_attrs.py index 610e88e6d3b..6077d51b099 100644 --- a/backends/qualcomm/_passes/annotate_quant_attrs.py +++ b/backends/qualcomm/_passes/annotate_quant_attrs.py @@ -19,6 +19,7 @@ QCOM_SCALE, QCOM_ZERO_POINT, ) +from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult from .utils import get_quant_attrs @@ -38,6 +39,9 @@ def __init__( super(AnnotateQuantAttrs, self).__init__() self.edge_program = edge_program self.skip_advanced_requant = skip_advanced_requant + self.skip_requant_allowlist = { + exir_ops.edge.aten.sigmoid.default, + } def _annotate_source_nodes( self, quant_node: torch.fx.Node, quant_attrs: Dict[str, Any] @@ -80,6 +84,10 @@ def _annotate_requant(self, n): # node1 -> q_ui8 (n) -> dq_ui8 -> 
q_int32 -> dq_int32 -> node2 -> .... # We store {node2: quant_attr in dq_int32} in node1.meta if n.target in q_ops and n.args[0].target not in dq_ops: + # for some fixed scale op, there is no need to requantize it + if n.args[0].target in self.skip_requant_allowlist: + return + dq_nodes = self._find_last_dq_nodes(n) q_attrs = get_quant_attrs(self.edge_program, n) for dq_node in dq_nodes: diff --git a/backends/qualcomm/_passes/canonicalize_conv.py b/backends/qualcomm/_passes/canonicalize_conv.py index 3804fb05da0..dc5c26c1a94 100644 --- a/backends/qualcomm/_passes/canonicalize_conv.py +++ b/backends/qualcomm/_passes/canonicalize_conv.py @@ -34,6 +34,7 @@ def __init__(self, edge_program: torch.export.ExportedProgram): self.transpose_conv_set = { torch.ops.aten.conv_transpose1d.default, torch.ops.aten.conv_transpose2d.input, + torch.ops.aten.conv_transpose3d.input, } def dilate(self, tensor, dilation): diff --git a/backends/qualcomm/_passes/decompose_any.py b/backends/qualcomm/_passes/decompose_any.py index e92bf11dd18..0cb959ff77f 100644 --- a/backends/qualcomm/_passes/decompose_any.py +++ b/backends/qualcomm/_passes/decompose_any.py @@ -8,6 +8,8 @@ from executorch.exir import to_edge from executorch.exir.pass_base import ExportPass, PassResult +from .utils import merge_decomposed_graph + class Any(torch.nn.Module): def __init__(self, dim, keepdim): @@ -49,26 +51,12 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: # remap is used to map original node values to new node values, # which ensures that reference to nodes are correctly updated in the new graph remap = {"x": node.args[0]} - - for decomposed_node in decomposed_module.graph.nodes: - # no need to copy existent 'output' - if decomposed_node.op == "output": - for user in node.users.copy(): - # remap - user.replace_input_with( - node, - remap[decomposed_node.args[0][0]], - ) - # no need to copy existent placeholders - elif decomposed_node.op == "placeholder": - # replace node map from string 
to graph node - remap[decomposed_node] = remap.pop(decomposed_node.name) - else: - remap[decomposed_node] = graph.node_copy( - decomposed_node, - arg_transform=lambda x, remap=remap: remap[x], - ) - + merge_decomposed_graph( + remap=remap, + target_node=node, + target_graph=graph, + decomposed_graph_module=decomposed_module, + ) graph.erase_node(node) graph.eliminate_dead_code() diff --git a/backends/qualcomm/_passes/decompose_binary_alpha.py b/backends/qualcomm/_passes/decompose_binary_alpha.py new file mode 100644 index 00000000000..df767f10ca9 --- /dev/null +++ b/backends/qualcomm/_passes/decompose_binary_alpha.py @@ -0,0 +1,61 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from executorch.exir.pass_base import ExportPass, PassResult + +from .utils import copy_meta + +decomp_set = {torch.ops.aten.add.Tensor, torch.ops.aten.sub.Tensor} + + +class DecomposeBinaryAlpha(ExportPass): + """ + QNN does not support alpha parameter for add/sub. 
+ Decompose to mul + add / mul + sub + """ + + def __init__(self) -> None: + super().__init__() + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + graph = graph_module.graph + for node in graph.nodes: + if ( + node.target in decomp_set + and "alpha" in node.kwargs + and node.kwargs["alpha"] != 1 + ): + alpha = node.kwargs["alpha"] + # Remove alpha from immutable dict + node.kwargs = {k: v for k, v in node.kwargs.items() if k != "alpha"} + input2_node = node.args[1] + # If input2 is constant, we can just multiply the value for optimization + if isinstance(input2_node, (int, float)): + arg_list = list(node.args) + arg_list[1] = input2_node * alpha + node.args = tuple(arg_list) + continue + with graph.inserting_before(node): + mul_op = torch.ops.aten.mul.Scalar + mul_node = graph.create_node( + "call_function", + mul_op, + ( + input2_node, + alpha, + ), + ) + mul_node.meta = copy_meta(node.meta) + node.replace_input_with(input2_node, mul_node) + node.args = ( + node.args[0], + mul_node, + ) + + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/qualcomm/_passes/decompose_cdist.py b/backends/qualcomm/_passes/decompose_cdist.py index d18a0295ffb..a3c812bdc37 100644 --- a/backends/qualcomm/_passes/decompose_cdist.py +++ b/backends/qualcomm/_passes/decompose_cdist.py @@ -7,6 +7,8 @@ import torch from executorch.exir.pass_base import ExportPass, PassResult +from .utils import merge_decomposed_graph + class CDist(torch.nn.Module): def __init__(self): @@ -54,26 +56,12 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: # remap is used to map original node values to new node values, # which ensures that reference to nodes are correctly updated in the new graph remap = {"x": node.args[0], "y": node.args[1]} - - for decomposed_node in decomposed_module.graph.nodes: - # no need to copy existent 'output' - if decomposed_node.op == "output": - for user in node.users.copy(): - # 
remap - user.replace_input_with( - node, - remap[decomposed_node.args[0][0]], - ) - # no need to copy existent placeholders - elif decomposed_node.op == "placeholder": - # replace node map from string to graph node - remap[decomposed_node] = remap.pop(decomposed_node.name) - else: - remap[decomposed_node] = graph.node_copy( - decomposed_node, - arg_transform=lambda x, remap=remap: remap[x], - ) - + merge_decomposed_graph( + remap=remap, + target_node=node, + target_graph=graph, + decomposed_graph_module=decomposed_module, + ) graph.erase_node(node) graph.eliminate_dead_code() diff --git a/backends/qualcomm/_passes/decompose_einsum.py b/backends/qualcomm/_passes/decompose_einsum.py index 046c1598311..464d989333f 100644 --- a/backends/qualcomm/_passes/decompose_einsum.py +++ b/backends/qualcomm/_passes/decompose_einsum.py @@ -8,7 +8,7 @@ from executorch.exir.pass_base import ExportPass, PassResult from torch.fx.experimental.proxy_tensor import make_fx -from .utils import copy_nn_module_stack +from .utils import merge_decomposed_graph class DecomposeEinsum(ExportPass): @@ -37,30 +37,13 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: for i, arg in enumerate(node.args[1]): remap[f"arg1_{i+1}"] = arg - for decomposed_node in decomposed_module.graph.nodes: - copy_nn_module_stack(node, decomposed_node) - # This is the arg[0] equation string, which is not required anymore after decomposition - if "arg0" in decomposed_node.name: - continue - - # no need to copy existent 'output' - if decomposed_node.op == "output": - for user in node.users.copy(): - # remap - user.replace_input_with( - node, - remap[decomposed_node.args[0][0]], - ) - # no need to copy existent placeholders - elif decomposed_node.op == "placeholder": - # replace node map from string to graph node - remap[decomposed_node] = remap.pop(decomposed_node.name) - else: - remap[decomposed_node] = graph.node_copy( - decomposed_node, - arg_transform=lambda x, remap=remap: remap[x], - ) - + 
merge_decomposed_graph( + remap=remap, + target_node=node, + target_graph=graph, + decomposed_graph_module=decomposed_module, + predicate=lambda decomp_node: "arg0" not in decomp_node.name, + ) graph.erase_node(node) graph.eliminate_dead_code() diff --git a/backends/qualcomm/_passes/decompose_floor_divide.py b/backends/qualcomm/_passes/decompose_floor_divide.py new file mode 100644 index 00000000000..f7de074259e --- /dev/null +++ b/backends/qualcomm/_passes/decompose_floor_divide.py @@ -0,0 +1,62 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from executorch.exir.pass_base import ExportPass, PassResult + +from .utils import merge_decomposed_graph + + +class FloorDivide(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y): + dtype = x.dtype + result = torch.div(x, y) + result = torch.floor(result) + return result.to(dtype) + + +class DecomposeFloorDivide(ExportPass): + """ + Decompose for math equivalent op. + Since QNN does not support floor_divide operations for int32 or int64 inputs, + it is necessary to decompose the operation into a division using floating-point precision, + followed by applying the floor function. 
+ """ + + def __init__(self) -> None: + super().__init__() + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + graph = graph_module.graph + for node in graph.nodes: + model = FloorDivide() + if ( + torch.ops.aten.floor_divide.default == node.target + and not torch.is_floating_point(node.meta["val"]) + ): + decomposed_module = torch.export.export( + model, + (node.args[0].meta["val"], node.args[1].meta["val"]), + strict=True, + ).module() + with graph.inserting_before(node): + # remap is used to map original node values to new node values, + # which ensures that reference to nodes are correctly updated in the new graph + remap = {"x": node.args[0], "y": node.args[1]} + merge_decomposed_graph( + remap=remap, + target_node=node, + target_graph=graph, + decomposed_graph_module=decomposed_module, + ) + graph.erase_node(node) + + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/qualcomm/_passes/decompose_glu.py b/backends/qualcomm/_passes/decompose_glu.py new file mode 100644 index 00000000000..de363468799 --- /dev/null +++ b/backends/qualcomm/_passes/decompose_glu.py @@ -0,0 +1,55 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from executorch.exir.pass_base import ExportPass, PassResult + +from .utils import merge_decomposed_graph + + +# this wrapper is required for IO name mapping with decomposed graph +class Glu(torch.nn.Module): + def __init__(self, dim=-1): + super().__init__() + self.glu = torch.nn.GLU(dim=dim) + + def forward(self, x): + return self.glu(x) + + +class DecomposeGlu(ExportPass): + """ + Decompose glu for quantization annotation to work properly. 
+ """ + + def __init__(self) -> None: + super().__init__() + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + graph = graph_module.graph + for node in graph.nodes: + if node.target == torch.ops.aten.glu.default: + ep = torch.export.export( + Glu(dim=-1 if len(node.args) < 2 else node.args[1]), + (node.args[0].meta["val"],), + ) + decomposed_module = ep.run_decompositions().graph_module + + with graph.inserting_before(node): + # remap is used to map original node values to new node values, + # which ensures that reference to nodes are correctly updated in the new graph + remap = {"x": node.args[0]} + merge_decomposed_graph( + remap=remap, + target_node=node, + target_graph=graph, + decomposed_graph_module=decomposed_module, + ) + graph.erase_node(node) + + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/qualcomm/_passes/decompose_linalg_vector_norm.py b/backends/qualcomm/_passes/decompose_linalg_vector_norm.py index 993f088da12..94a5b10ba3f 100644 --- a/backends/qualcomm/_passes/decompose_linalg_vector_norm.py +++ b/backends/qualcomm/_passes/decompose_linalg_vector_norm.py @@ -8,7 +8,7 @@ from executorch.exir import to_edge from executorch.exir.pass_base import ExportPass, PassResult -from .utils import copy_nn_module_stack +from .utils import merge_decomposed_graph class LinalgVectorNorm(torch.nn.Module): @@ -62,27 +62,12 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: # remap is used to map original node values to new node values, # which ensures that reference to nodes are correctly updated in the new graph remap = {"x": node.args[0]} - - for decomposed_node in decomposed_module.graph.nodes: - copy_nn_module_stack(node, decomposed_node) - # no need to copy existent 'output' - if decomposed_node.op == "output": - for user in node.users.copy(): - # remap - user.replace_input_with( - node, - remap[decomposed_node.args[0][0]], - ) - # no need to copy 
existent placeholders - elif decomposed_node.op == "placeholder": - # replace node map from string to graph node - remap[decomposed_node] = remap.pop(decomposed_node.name) - else: - remap[decomposed_node] = graph.node_copy( - decomposed_node, - arg_transform=lambda x, remap=remap: remap[x], - ) - + merge_decomposed_graph( + remap=remap, + target_node=node, + target_graph=graph, + decomposed_graph_module=decomposed_module, + ) graph.erase_node(node) graph.eliminate_dead_code() diff --git a/backends/qualcomm/_passes/decompose_roll.py b/backends/qualcomm/_passes/decompose_roll.py index e13433508f5..e6f60d55464 100644 --- a/backends/qualcomm/_passes/decompose_roll.py +++ b/backends/qualcomm/_passes/decompose_roll.py @@ -7,7 +7,7 @@ from executorch.exir.pass_base import ExportPass, PassResult -from .utils import copy_nn_module_stack +from .utils import merge_decomposed_graph class SliceCopy(torch.nn.Module): @@ -65,27 +65,12 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: # remap is used to map original node values to new node values, # which ensures that reference to nodes are correctly updated in the new graph remap = {"x": input_node} - - for decomposed_node in decomposed_module.graph.nodes: - copy_nn_module_stack(node, decomposed_node) - # no need to copy existent 'output' - if decomposed_node.op == "output": - for user in node.users.copy(): - # remap - user.replace_input_with( - node, - remap[decomposed_node.args[0][0]], - ) - # no need to copy existent placeholders - elif decomposed_node.op == "placeholder": - # replace node map from string to graph node - remap[decomposed_node] = remap.pop(decomposed_node.name) - else: - remap[decomposed_node] = graph.node_copy( - decomposed_node, - arg_transform=lambda x, remap=remap: remap[x], - ) - + merge_decomposed_graph( + remap=remap, + target_node=node, + target_graph=graph, + decomposed_graph_module=decomposed_module, + ) graph.erase_node(node) graph.eliminate_dead_code() diff --git 
a/backends/qualcomm/_passes/decompose_threshold.py b/backends/qualcomm/_passes/decompose_threshold.py new file mode 100644 index 00000000000..0f0a1bc4ea8 --- /dev/null +++ b/backends/qualcomm/_passes/decompose_threshold.py @@ -0,0 +1,61 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +import torch + +from executorch.exir.pass_base import ExportPass, PassResult + +from .utils import merge_decomposed_graph + + +class DecomposeModule(torch.nn.Module): + def __init__(self, threshold, value): + super().__init__() + self.threshold = threshold + self.value = value + + def forward(self, x): + return torch.where(x <= self.threshold, self.value, x) + + +class DecomposeThreshold(ExportPass): + """ + Decompose threshold to less_equal and where. + """ + + def __init__(self) -> None: + super().__init__() + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + graph = graph_module.graph + for node in graph.nodes: + if node.target in { + torch.ops.aten.threshold_.default, + torch.ops.aten.threshold.default, + }: + input_node = node.args[0] + threshold = node.args[1] + value = node.args[2] + + model = DecomposeModule(threshold, value) + decomposed_module = torch.export.export( + model, (input_node.meta["val"],), strict=True + ).module() + + with graph.inserting_before(node): + # remap is used to map original node values to new node values, + # which ensures that reference to nodes are correctly updated in the new graph + remap = {"x": input_node} + merge_decomposed_graph( + remap=remap, + target_node=node, + target_graph=graph, + decomposed_graph_module=decomposed_module, + ) + graph.erase_node(node) + + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/qualcomm/_passes/decompose_wrap_with_autocast.py 
b/backends/qualcomm/_passes/decompose_wrap_with_autocast.py index 6c073bd309c..1b60b740ed3 100644 --- a/backends/qualcomm/_passes/decompose_wrap_with_autocast.py +++ b/backends/qualcomm/_passes/decompose_wrap_with_autocast.py @@ -10,7 +10,7 @@ import torch from executorch.exir.pass_base import ExportPass, PassResult -from .utils import copy_nn_module_stack +from .utils import merge_decomposed_graph class DecomposeWrapWithAutocast(ExportPass): @@ -52,7 +52,7 @@ def _replace(self, gm: torch.fx.GraphModule) -> None: graph = gm.graph for node in graph.nodes: if isinstance(node.target, torch._higher_order_ops.wrap.WrapWithAutocast): - submod, submod_name = self._get_submod(gm, node) + submod, _ = self._get_submod(gm, node) n_args = node.args input_submod = n_args[4] decomposed_module = submod @@ -61,22 +61,13 @@ def _replace(self, gm: torch.fx.GraphModule) -> None: # which ensures that reference to nodes are correctly updated in the new graph # remap = {"expand_1": node.args[5], "to_4": node.args[6]} remap = {n_args[i].name: n_args[i] for i in range(5, len(n_args))} - - for decomposed_node in decomposed_module.graph.nodes: - copy_nn_module_stack(node, decomposed_node) - # no need to copy existent 'output' - if decomposed_node.op == "output": - self._replace_output(node, decomposed_node, remap) - # no need to copy existent placeholders - elif decomposed_node.op == "placeholder": - # replace node map from string to graph node - remap[decomposed_node] = remap.pop(decomposed_node.name) - else: - remap[decomposed_node] = graph.node_copy( - decomposed_node, - arg_transform=lambda x, remap=remap: remap[x], - ) - + merge_decomposed_graph( + remap=remap, + target_node=node, + target_graph=graph, + decomposed_graph_module=decomposed_module, + output_processor=self._replace_output, + ) graph.erase_node(node) graph.erase_node(input_submod) diff --git a/backends/qualcomm/_passes/fixed_linear_keep_dim.py b/backends/qualcomm/_passes/fixed_linear_keep_dim.py index 
19f5c631921..04c0f92cebf 100644 --- a/backends/qualcomm/_passes/fixed_linear_keep_dim.py +++ b/backends/qualcomm/_passes/fixed_linear_keep_dim.py @@ -5,10 +5,14 @@ # LICENSE file in the root directory of this source tree. import torch +from executorch.backends.qualcomm.builders.node_visitor import dq_ops +from executorch.backends.qualcomm.utils.constants import QCOM_QUANT_ATTRS from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult from executorch.exir.passes import dead_code_elimination_pass +from .utils import copy_meta, get_quant_attrs + class FixedLinearKeepDim(ExportPass): """ @@ -18,8 +22,12 @@ class FixedLinearKeepDim(ExportPass): view_copy = exir_ops.edge.aten.view_copy.default linear = exir_ops.edge.aten.linear.default - def __init__(self): + def __init__( + self, + edge_program: torch.export.ExportedProgram, + ): super(FixedLinearKeepDim, self).__init__() + self.edge_program = edge_program def _fixed_keep_dim(self, graph_module: torch.fx.GraphModule): for node in graph_module.graph.nodes: @@ -46,9 +54,15 @@ def _fixed_keep_dim(self, graph_module: torch.fx.GraphModule): ) # meta needs to be copied elementwisely for fake-tensor # to be updated correctly and not affect meta of input_node - for k, v in input_node.meta.items(): - squeeze_node.meta[k] = v + squeeze_node.meta = copy_meta(input_node.meta) squeeze_node.meta["val"] = input_tensor.reshape(squeeze_dim) + # if input_node is dequantize, we need to fetch encodings manually + # TODO: remove this when constant fold mechanism is introduced + if input_node.target in dq_ops: + squeeze_node.meta[QCOM_QUANT_ATTRS] = get_quant_attrs( + self.edge_program, input_node + ) + for user in input_users: if user == linear_node: user.replace_input_with(input_node, squeeze_node) @@ -66,8 +80,7 @@ def _fixed_keep_dim(self, graph_module: torch.fx.GraphModule): ) # meta needs to be copied elementwisely for fake-tensor # to be updated correctly and not affect 
meta of unsqueeze_node - for k, v in linear_node.meta.items(): - unsqueeze_node.meta[k] = v + unsqueeze_node.meta = copy_meta(linear_node.meta) # update linear node's shape linear_node.meta["val"] = linear_output.reshape( (squeeze_node.meta["val"].shape[0], linear_output.shape[-1]) diff --git a/backends/qualcomm/_passes/insert_reshape_for_reduce_ops.py b/backends/qualcomm/_passes/insert_reshape_for_reduce_ops.py new file mode 100644 index 00000000000..52f9546c28e --- /dev/null +++ b/backends/qualcomm/_passes/insert_reshape_for_reduce_ops.py @@ -0,0 +1,59 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from executorch.exir.pass_base import ExportPass, PassResult +from executorch.exir.passes import dead_code_elimination_pass + + +class InsertReshapeForReduceOps(ExportPass): + """ + Rewrite `aten.argmax.default` with `dim=None` into + a reshape-to-1D followed by argmax(dim=0). + + PyTorch semantics: + torch.argmax(x, dim=None) -> flatten(x) then argmax along axis=0 + + QNN requires an explicit axis, so we insert the reshape. 
+ """ + + def __init__(self): + super().__init__() + self.op_map = {torch.ops.aten.argmax.default, torch.ops.aten.argmin.default} + + def call(self, graph_module: torch.fx.GraphModule): + graph = graph_module.graph + modified = False + + for n in graph.nodes: + if n.target in self.op_map: + dim_arg = None if len(n.args) == 1 else n.args[1] + + if dim_arg is None: + inp = n.args[0] + + # Insert reshape before argmax + with graph.inserting_before(n): + reshape_node = graph.create_node( + "call_function", + torch.ops.aten.reshape.default, + (inp, [-1]), + {}, + ) + reshape_node.meta = dict(inp.meta) + if "val" in inp.meta: + reshape_node.meta["val"] = inp.meta["val"].reshape(-1) + + # Rewrite argmax: take reshape_node as input, set dim=0 + n.args = (reshape_node, 0, *n.args[2:]) + + modified = True + + if modified: + graph_module.recompile() + dead_code_elimination_pass(graph_module) + + return PassResult(graph_module, modified) diff --git a/backends/qualcomm/_passes/lift_constant_scalar_operands.py b/backends/qualcomm/_passes/lift_constant_scalar_operands.py index f5c5915cab2..52bdf7fa090 100644 --- a/backends/qualcomm/_passes/lift_constant_scalar_operands.py +++ b/backends/qualcomm/_passes/lift_constant_scalar_operands.py @@ -51,6 +51,7 @@ class TensorOpInfo: # The scalar number arg[1] is missing when using default. 
Result in a corner case to deal aten.leaky_relu.default: TensorOpInfo(aten.prelu.default, True, False), aten.leaky_relu_.default: TensorOpInfo(aten.prelu.default, True, False), + aten.where.ScalarSelf: TensorOpInfo(aten.where.self, False, True), aten.where.ScalarOther: TensorOpInfo(aten.where.self, False, True), aten.where.Scalar: TensorOpInfo(aten.where.self, False, True), aten.masked_fill.Scalar: TensorOpInfo(aten.masked_fill.Tensor, False, False), diff --git a/backends/qualcomm/_passes/qnn_pass_manager.py b/backends/qualcomm/_passes/qnn_pass_manager.py index ffb9f3221df..360581a2929 100644 --- a/backends/qualcomm/_passes/qnn_pass_manager.py +++ b/backends/qualcomm/_passes/qnn_pass_manager.py @@ -18,14 +18,18 @@ ConvertLinearToConv2d, ConvertSquareToPow, DecomposeAny, + DecomposeBinaryAlpha, DecomposeCDist, DecomposeColIm, DecomposeEinsum, DecomposeExpM1, + DecomposeFloorDivide, + DecomposeGlu, DecomposeLinalgVectorNorm, DecomposeMinMaxDim, DecomposeRoll, DecomposeSilu, + DecomposeThreshold, DecomposeWrapWithAutocast, ExpandBroadcastTensorShape, FixedLinearKeepDim, @@ -35,6 +39,7 @@ I64toI32, InsertIOQDQ, InsertRequantize, + InsertReshapeForReduceOps, LayoutTransform, LiftConstantScalarOperands, RecomposePixelUnshuffle, @@ -193,26 +198,37 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule): self.add_pass(RecomposePixelUnshuffle(quantization_capture=True)) self.add_pass(RecomposeRmsNorm(quantization_capture=True)) self.add_pass(ReplaceArangeArgs()) + self.add_pass(DecomposeBinaryAlpha()) self.add_pass(DecomposeCDist()) self.add_pass(DecomposeScaledDotProductAttention()) self.add_pass(DecomposeRoll()) self.add_pass(DecomposeSilu()) + self.add_pass(DecomposeThreshold()) self.add_pass(DecomposeWrapWithAutocast()) self.add_pass(DecomposeEinsum()) self.add_pass(DecomposeExpM1()) + self.add_pass(DecomposeGlu()) self.add_pass(DecomposeLinalgVectorNorm(quantization_capture=True)) self.add_pass(ReplaceInfValues()) 
self.add_pass(LiftConstantScalarOperands()) + self.add_pass(InsertReshapeForReduceOps()) return self._transform(graph_module) def transform_for_export_pipeline( self, exported_program: ExportedProgram, convert_linear_to_conv2d: bool = False ): + self.add_pass(DecomposeBinaryAlpha()) self.add_pass(DecomposeCDist()) self.add_pass(DecomposeScaledDotProductAttention()) self.add_pass(DecomposeRoll()) + self.add_pass(DecomposeThreshold()) self.add_pass(DecomposeLinalgVectorNorm(quantization_capture=True)) self.add_pass(DecomposeExpM1()) + # DecomposeFloorDivide does not apply to the annotation pipeline, + # since the CPU QDQ model would reduce accuracy. + # We keep div and floor operations in floating-point to maintain precision. + # This pass is needed before to_edge pipeline to avoid mixed type for div operator with RemoveMixedTypeOperators pass. + self.add_pass(DecomposeFloorDivide()) self.add_pass(DecomposeWrapWithAutocast()) # this pass will rewrite state_dict, it needs to be accomplished before # to_edge_transform_and_lower @@ -221,6 +237,7 @@ def transform_for_export_pipeline( self.add_pass(ConvertLinearToConv2d(exported_program)) self.add_pass(ConvertSquareToPow()) self.add_pass(LiftConstantScalarOperands()) + self.add_pass(InsertReshapeForReduceOps()) self._transform(exported_program.graph_module) ep = lift_constant_tensor_pass(exported_program) return ep diff --git a/backends/qualcomm/_passes/utils.py b/backends/qualcomm/_passes/utils.py index 6d908707892..eebfa4d9eb4 100755 --- a/backends/qualcomm/_passes/utils.py +++ b/backends/qualcomm/_passes/utils.py @@ -117,6 +117,45 @@ def copy_nn_module_stack(src, target): target.meta["nn_module_stack"] = value +def merge_decomposed_graph( + remap: Dict[str, torch.fx.Node], + target_node: torch.fx.Node, + target_graph: torch.fx.Graph, + decomposed_graph_module: torch.fx.GraphModule, + predicate: Callable[[torch.fx.Node], bool] = None, + # target_node, decomposed_output_node, remap + output_processor: Callable[ +
[torch.fx.Node, torch.fx.Node, Dict[str, torch.fx.Node]], None + ] = None, +) -> None: + def default_output_process(node): + for user in node.users.copy(): + # remap + user.replace_input_with( + node, + remap[decomposed_node.args[0][0]], + ) + + for decomposed_node in decomposed_graph_module.graph.nodes: + copy_nn_module_stack(target_node, decomposed_node) + if predicate is None or predicate(decomposed_node): + # no need to copy existent 'output' + if decomposed_node.op == "output": + if output_processor is None: + default_output_process(target_node) + else: + output_processor(target_node, decomposed_node, remap) + # no need to copy existent placeholders + elif decomposed_node.op == "placeholder": + # replace node map from string to graph node + remap[decomposed_node] = remap.pop(decomposed_node.name) + else: + remap[decomposed_node] = target_graph.node_copy( + decomposed_node, + arg_transform=lambda x, remap=remap: remap[x], + ) + + def is_float_tensor(node: torch.fx.Node) -> bool: if "val" not in node.meta or not isinstance(node.meta["val"], FakeTensor): return False diff --git a/backends/qualcomm/builders/README.md b/backends/qualcomm/builders/README.md index 6ba4eafb01f..61ae1061214 100644 --- a/backends/qualcomm/builders/README.md +++ b/backends/qualcomm/builders/README.md @@ -365,7 +365,7 @@ Please help update following table if you are contributing new operators: + 🚫 = Deprecated, supported with other QNN Ops -| Operators | HTP - 90/116 Enabled | +| Operators | HTP - 92/116 Enabled | |-----------|---------| | Argmax | ✓ | | Argmin | ✓ | @@ -375,7 +375,7 @@ Please help update following table if you are contributing new operators: | ChannelShuffle | ✗ | | Concat | ✓ | | Conv2d | ✓ | -| Conv3d | ✗ | +| Conv3d | ✓ | | Convert | ✓ | | CreateSparse | ✗ | | CumulativeSum | ✓ | @@ -481,7 +481,7 @@ Please help update following table if you are contributing new operators: | TopK | ✓ | | TransPose | ✓ | | TransPoseConv2d | ✓ | -| TransPoseConv3d | ✗ | +| 
TransPoseConv3d | ✓ | | Unpack | ✓ | ## Issues diff --git a/backends/qualcomm/builders/__init__.py b/backends/qualcomm/builders/__init__.py index 9800fb7bdab..3fa8ae067fa 100644 --- a/backends/qualcomm/builders/__init__.py +++ b/backends/qualcomm/builders/__init__.py @@ -24,7 +24,7 @@ op_cat, op_ceil, op_clamp, - op_conv2d, + op_conv, op_copy, op_cos, op_cum_sum, @@ -129,7 +129,7 @@ op_cat, op_ceil, op_clamp, - op_conv2d, + op_conv, op_copy, op_cos, op_cum_sum, diff --git a/backends/qualcomm/builders/node_visitor.py b/backends/qualcomm/builders/node_visitor.py index bc2b62c8c0b..8cbf3a50e22 100644 --- a/backends/qualcomm/builders/node_visitor.py +++ b/backends/qualcomm/builders/node_visitor.py @@ -176,7 +176,7 @@ def make_qnn_per_block_config(self, node: torch.fx.Node, quant_attrs: Dict): user_0 = self.get_first_user(node) if "convolution" in user_0.target.__name__: # OIHW (pytorch) -> HWIO (QNN) - quant_config[QCOM_AXIS] = 3 + quant_config[QCOM_AXIS] = node.meta["val"].dim() - 1 quant_config[QCOM_AXIS_ORDER] = (2, 3, 1, 0) elif "linear" in user_0.target.__name__: # OI (pytorch) -> OI (QNN) @@ -218,7 +218,7 @@ def make_qnn_per_channel_config(self, node: torch.fx.Node, quant_attrs: Dict): user_0 = self.get_first_user(node) # Memory layout of QNN conv weight always ends in Output. 
Like conv2d is HWIO if "convolution" in user_0.target.__name__: - quant_config[QCOM_AXIS] = 3 + quant_config[QCOM_AXIS] = node.meta["val"].dim() - 1 else: quant_config[QCOM_AXIS] = quant_attrs[QCOM_AXIS] diff --git a/backends/qualcomm/builders/op_cat.py b/backends/qualcomm/builders/op_cat.py index 9f6eb6676cf..644b087ab9c 100644 --- a/backends/qualcomm/builders/op_cat.py +++ b/backends/qualcomm/builders/op_cat.py @@ -29,14 +29,15 @@ def define_node( node: torch.fx.Node, nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper], ) -> PyQnnWrapper.PyQnnOpWrapper: - list_of_tensors = cast(List[torch.fx.Node], node.args[0]) - list_of_tensor_wrappers = [] + input_nodes = cast(List[torch.fx.Node], node.args[0]) + input_tensor_wrappers = [] - for tensor_input in list_of_tensors: - input_tensor = self.get_tensor(self.get_node(tensor_input), node) - list_of_tensor_wrappers.append( + for input_node in input_nodes: + source_input_node = self.get_node(input_node) + input_tensor = self.get_tensor(source_input_node, node) + input_tensor_wrappers.append( self.define_tensor( - tensor_input, + source_input_node, node, input_tensor, PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, @@ -44,7 +45,7 @@ def define_node( ) ) - if len(list_of_tensors) != len(list_of_tensor_wrappers): + if len(input_nodes) != len(input_tensor_wrappers): warnings.warn( "[QNN Delegate Op Builder]: The number or input tensors is not equal to the number of input tensor wrappers.", stacklevel=1, @@ -76,7 +77,7 @@ def define_node( QNN_OP_PACKAGE_NAME_QTI_AISW, OpConcat.op_name, ) - concat_op.AddInputTensors(list_of_tensor_wrappers) + concat_op.AddInputTensors(input_tensor_wrappers) concat_op.AddOutputTensors([output_tensor_wrapper]) concat_op.AddScalarParam( diff --git a/backends/qualcomm/builders/op_conv2d.py b/backends/qualcomm/builders/op_conv.py similarity index 82% rename from backends/qualcomm/builders/op_conv2d.py rename to backends/qualcomm/builders/op_conv.py index 1cfc1e45c9b..2bc0b41524d 
100644 --- a/backends/qualcomm/builders/op_conv2d.py +++ b/backends/qualcomm/builders/op_conv.py @@ -7,7 +7,6 @@ from typing import cast, Dict, List import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper - import numpy as np import torch from executorch.backends.qualcomm.utils.constants import QCOM_DATA @@ -16,8 +15,10 @@ from .node_visitor_manager import register_node_visitor from .qnn_constants import ( OpConv2d, + OpConv3d, OpDepthWiseConv2d, OpTransposeConv2d, + OpTransposeConv3d, QNN_OP_PACKAGE_NAME_QTI_AISW, ) from .utils import get_parameter @@ -66,7 +67,7 @@ def _add_conv_op_parameter( len(padding_shape), padding_shape, np.array( - [[padding[0], padding[0]], [padding[1], padding[1]]], + padding, dtype=np.uint32, ), True, @@ -108,8 +109,14 @@ def define_node( input_node = self.get_node(node.args[0]) input_tensor = self.get_tensor(input_node, node) assert ( - input_tensor.dim() == 4 + input_tensor.dim() != 3 ), "All Conv1D should be converted to Conv2D in CanonicalizeConv," + assert input_tensor.dim() in { + 4, + 5, + }, "Only Conv2d and Conv3d is supported in conv builder," + + is_conv2d = input_tensor.dim() == 4 input_tensor_wrapper = self.define_tensor( input_node, node, @@ -120,9 +127,15 @@ def define_node( filter_node = self.get_node(node.args[1]) filter_tensor = get_parameter(filter_node, self.edge_program) - # weight of pytorch OIHW(conv2d) | IOHW(conv_transpose2d), yet QNN is HWIO + # weight of pytorch OIHW(conv2d) / OIDHW(conv3d) or IOHW(conv_transpose2d) / IODHW(conv_transpose3d), + # yet QNN is HWIO or DHWIO is_transpose_conv = cast(bool, node.args[6]) - filter_axis_order = (2, 3, 0, 1) if is_transpose_conv else (2, 3, 1, 0) + if is_conv2d: + filter_axis_order = (2, 3, 0, 1) if is_transpose_conv else (2, 3, 1, 0) + else: + filter_axis_order = ( + (2, 3, 4, 0, 1) if is_transpose_conv else (2, 3, 4, 1, 0) + ) filter_tensor = filter_tensor.permute(dims=filter_axis_order).contiguous() filter_tensor_wrapper = self.define_tensor( 
filter_node, @@ -132,7 +145,6 @@ def define_node( nodes_to_wrappers, ) conv_input_tensors = [input_tensor_wrapper, filter_tensor_wrapper] - if node.args[2] is not None: bias_node = self.get_node(node.args[2]) bias_tensor = get_parameter(bias_node, self.edge_program) @@ -159,11 +171,10 @@ def define_node( padding = cast(List[int], node.args[4]) dilation = cast(List[int], node.args[5]) output_padding = cast(List[int], node.args[7]) - groups = cast(int, node.args[8]) - # Qnn filter tensor is (H, W, Cin, Cout) - group_input_channels = filter_tensor.shape[2] - group_output_channels = int(filter_tensor.shape[3] / groups) + # Qnn filter tensor is (H, W, Cin, Cout) or (D, H, W, Cin, Cout) + group_input_channels = filter_tensor.shape[-2] + group_output_channels = int(filter_tensor.shape[-1] / groups) # 1) groups = input_channels (i.e. group_input_channels = 1) # 2) output_channels is a positive integer multiple of input channels # TODO: Currently, negative results will be zero with Depthwise conv2d when input_channel == groups == 1 @@ -175,18 +186,23 @@ def define_node( ) if len(padding) == 1: padding = padding + padding + padding = [[x, x] for x in padding] stride_shape = [len(stride)] - padding_shape = [2, 2] + padding_shape = [len(padding), len(padding[0])] dilation_shape = [len(dilation)] output_padding_shape = [len(output_padding)] - if is_depthwise_conv: + if is_transpose_conv: + assert all( + val == 1 for val in dilation + ), "CanonicalizeConv pass should perform dilate for transpose_conv." 
+ op_class = OpTransposeConv2d if is_conv2d else OpTransposeConv3d + elif is_depthwise_conv: + assert is_conv2d, "DepthWise only supports Conv2d" op_class = OpDepthWiseConv2d - elif is_transpose_conv: - op_class = OpTransposeConv2d else: - op_class = OpConv2d + op_class = OpConv2d if is_conv2d else OpConv3d conv_op = PyQnnWrapper.PyQnnOpWrapper( node.name, diff --git a/backends/qualcomm/builders/op_index_put.py b/backends/qualcomm/builders/op_index_put.py index c3c42ed483a..23481894f0d 100644 --- a/backends/qualcomm/builders/op_index_put.py +++ b/backends/qualcomm/builders/op_index_put.py @@ -1,14 +1,19 @@ import warnings +from collections import OrderedDict from typing import Dict import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper import numpy as np import torch -from executorch.backends.qualcomm.utils.constants import QCOM_DATA, QCOM_QUANT_ATTRS +from executorch.backends.qualcomm.utils.constants import ( + QCOM_DATA, + QCOM_DTYPE, + QCOM_QUANT_ATTRS, +) from executorch.exir.dialects._ops import ops as exir_ops -from .node_visitor import NodeVisitor, QNN_TENSOR_TYPE_MAP +from .node_visitor import NodeVisitor, QNN_QUANT_TYPE_MAP, QNN_TENSOR_TYPE_MAP from .node_visitor_manager import register_node_visitor from .qnn_constants import ( OpConcat, @@ -26,7 +31,7 @@ class IndexPutVisitor(NodeVisitor): def __init__(self, *args) -> None: super().__init__(*args) - def define_node( + def define_node( # noqa: C901 self, node: torch.fx.Node, nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper], @@ -37,6 +42,7 @@ def define_node( if quant_attrs := node.meta.get(QCOM_QUANT_ATTRS): quant_attrs = quant_attrs.copy() input_node.meta[QCOM_QUANT_ATTRS] = quant_attrs + input_tensor = self.get_tensor(input_node, node) input_tensor_wrapper = self.define_tensor( input_node, @@ -46,52 +52,110 @@ def define_node( nodes_to_wrappers, ) - indicies_node = node.args[1] - index_node_dim = None - index_nodes = [] - index_tensors = [] + indices_nodes = ( + 
node.args[1] if isinstance(node.args[1], list) else [node.args[1]] + ) target_index = [] + all_range_index = OrderedDict() + index_dtype = [ + node.meta["val"].dtype for node in indices_nodes if node is not None + ][0] + + # preprocess: + # - broadcast dimension for multiple specified index + # - broadcast specified index if dimensions are not matched + max_indices_in_specified_index = 0 + for index, idx_node in enumerate(indices_nodes): + if isinstance(idx_node, torch.fx.Node): + last_specified_index_node = index + if max_indices_in_specified_index < idx_node.meta["val"].nelement(): + max_indices_in_specified_index = idx_node.meta["val"].nelement() # If there is None in a list, it means all range at that dimension - # E.g., indicies_node: [None, None, aten__to_copy_default_1] - if isinstance(indicies_node, list): - for index, idx_node in enumerate(indicies_node): - # First, collect the indice_node and index of None to construct the shape of index node - # E.g., shape of input: [1, 1024, 12, 64] - # For "None" axis (assume indicies_node: [None, None, aten__to_copy_default_1]), - # target_index: [1, 1024, x], x is the shape of index_tensor, index_node_dim: 2 - if isinstance(idx_node, torch.fx.Node): - index_nodes.append(idx_node) - index_tensors.append(self.get_tensor(idx_node, idx_node)) - target_index.extend(index_tensors[-1].size()) - index_node_dim = index - elif idx_node is None and index_node_dim is None: - # E.g., indicies_node: [None, aten__to_copy_default_1, None] - # Don't need to consider "None" after index_node. 
- target_index.append(input_tensor.size(index)) - else: - warnings.warn( - f"[QNN Delegate Op Builder]: Get the index {idx_node} that is neither a node nor None", - stacklevel=1, + for index, idx_node in enumerate(indices_nodes): + # First, collect the index_node and index of None to construct the shape of index node + # E.g., shape of input: [1, 1024, 12, 64] + # For "None" axis (assume indices_node: [None, None, aten__to_copy_default_1]), + # target_index: [1, 1024, x], x is the shape of index_tensor, index_node_dim: 2 + if isinstance(idx_node, torch.fx.Node): + # e.g. for case [index_node_0, None, index_node_1], nodes will have the same number of indices + target_index.append( + self.get_tensor(idx_node, idx_node).nelement() + if last_specified_index_node == index + else 1 + ) + elif idx_node is None: + # E.g., indices_node: [None, None, aten__to_copy_default_1] + all_range_index[index] = torch.arange( + input_tensor.size(index), dtype=index_dtype + ) + target_index.append(input_tensor.size(index)) + else: + warnings.warn( + f"[QNN Delegate Op Builder]: Get the index {idx_node} that is neither a node nor None", + stacklevel=1, + ) + return + + # preprocess all range indices if any + if None in indices_nodes: + all_range_tensor = torch.cartesian_prod(*all_range_index.values()) + # repeat all_range_tensor interleavely for future concatenation + # e.g. 
input_node = [5, 4, 3, 2], indices = [index_0_node, None, index_2_node] + # index_0.shape == index_2.shape == 2 (will guarantee this condition) + # where user specified (3, 4) for index_0, (0, 1) for index_2 + # --- + # we should have all_range_tensor: [0, 1, 2, 3] + # repeat interleavely with 2 to match future tiled index_0_node & index_2_node + # we'll have 1(index_0 -> same as index_2)*4(index_1)*2(index_2) indices in total: + # | index_0_node | None | index_2_node | + # | 3 | 0 | 0 | + # | 4 | 0 | 1 | + # | 3 | 1 | 0 | + # | 4 | 1 | 1 | + # | 3 | 2 | 0 | + # | 4 | 2 | 1 | + # | 3 | 3 | 0 | + # | 4 | 3 | 1 | + all_range_tensor_aug = all_range_tensor.repeat_interleave( + max_indices_in_specified_index, dim=0 + ) + for index in all_range_index.keys(): + # Repeat index for "None" axis in indices_nodes + range_index_node = torch.fx.Node( + node.graph, + node.name + f"_all_range_index_{index}", + "call_function", + exir_ops.edge.aten.tensor.default, + (), # args + {}, # kwargs + ) + range_indices = ( + ( + all_range_tensor_aug[:, index] + if all_range_tensor_aug.dim() > 1 + else + # if there is only one None + all_range_tensor_aug ) - return - # Assume that there is only one node in list - assert len(index_nodes) == 1, "Not support multiple indices tensor" - indice_node = index_nodes[0] - indice_tensor = index_tensors[0] - indices_tensor_wrapper = self.define_tensor( - indice_node, - node, - indice_tensor, - PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, - nodes_to_wrappers, - ) + .reshape(-1, 1) + .contiguous() + ) + target_index_tensor_wrapper = self.define_tensor( + range_index_node, + node, + range_indices, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, + nodes_to_wrappers, + ) + # store it for future concatenation + all_range_index[index] = (range_indices, target_index_tensor_wrapper) # Need to reconstruct the index tensor. # E.g., based on ScatterND Op Def in QNN Docs. 
# Torch: # Given that # shape of input: [1, 12, 1024, 64] - # indicies_node: [None, None, aten__to_copy_default_1] + # indices_node: [None, None, aten__to_copy_default_1] # shape of aten__to_copy_default_1: [1] # QNN: # Index tensor: @@ -104,113 +168,135 @@ def define_node( # update_indices = indices.shape[:-1] # for idx in np.ndindex(update_indices): # output[indices[idx]] = updates[idx] + specified_index = OrderedDict() + for i, indices_node in enumerate(indices_nodes): + if indices_node is None: + continue - # Append one dimension to specify x-tuple - index_shape = target_index + [1] - # Reshape the index_node for tile op - reshape_shape = [ - shape if id == index_node_dim else 1 for id, shape in enumerate(index_shape) - ] - reshape_output_tensor = indice_tensor.reshape(reshape_shape) - reshape_output_tensor_wrapper = self.define_custom_tensor_wrapper( - node_name=node.name + "_reshape", - tensor_type=PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, - dtype=QNN_TENSOR_TYPE_MAP[reshape_output_tensor.dtype], - quant_encoding=PyQnnWrapper.Qnn_QuantizationEncoding_t.QNN_QUANTIZATION_ENCODING_UNDEFINED, - quant_configs={}, - dims=reshape_output_tensor.size(), - tensor=reshape_output_tensor, - is_fake_tensor=True, - nodes_to_wrappers=nodes_to_wrappers, - ) - reshape_op = PyQnnWrapper.PyQnnOpWrapper( - node.name, - QNN_OP_PACKAGE_NAME_QTI_AISW, - OpReshape.op_name, - ) - reshape_op.AddInputTensors([indices_tensor_wrapper]) - reshape_op.AddOutputTensors([reshape_output_tensor_wrapper]) - op_wrapper_list.append(reshape_op) - index_put_index_input_tensor_wrapper = reshape_output_tensor_wrapper + indices_tensor = self.get_tensor(indices_node, indices_node) + indices_tensor_wrapper = self.define_tensor( + indices_node, + node, + indices_tensor, + PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + nodes_to_wrappers, + ) + if indices_tensor.nelement() < max_indices_in_specified_index: + # broadcast the specified index + indices_tensor = 
indices_tensor.repeat(max_indices_in_specified_index) + indices_multiples = [max_indices_in_specified_index] + indices_multiples_shape = [len(indices_multiples)] + indices_tile_tensor_wrapper = self.define_custom_tensor_wrapper( + node_name=node.name + f"_indices_tile_{i}", + tensor_type=PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + dtype=QNN_TENSOR_TYPE_MAP[indices_tensor.dtype], + quant_encoding=PyQnnWrapper.Qnn_QuantizationEncoding_t.QNN_QUANTIZATION_ENCODING_UNDEFINED, + quant_configs={}, + dims=indices_tensor.size(), + tensor=indices_tensor, + is_fake_tensor=True, + nodes_to_wrappers=nodes_to_wrappers, + ) + tile_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpTile.op_name, + ) + tile_op.AddInputTensors([indices_tensor_wrapper]) + tile_op.AddOutputTensors([indices_tile_tensor_wrapper]) + tile_op.AddTensorParam( + OpTile.param_multiples, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, + len(indices_multiples_shape), + indices_multiples_shape, + np.array(indices_multiples, dtype=np.uint32), + True, + ) + op_wrapper_list.append(tile_op) + indices_tensor_wrapper = indices_tile_tensor_wrapper - # Tile the index_node and concat the target index - if None in indicies_node: - tile_output_tensor = reshape_output_tensor.expand(index_shape) - # Tile the index_node to align with the shape of target_index - # Only need to tile the dim of None axis - # E.g., indicies_node: [None, None, aten__to_copy_default_1] - # Should tile the first two dimension. 
- multiples = [ - shape if id != index_node_dim else 1 - for id, shape in enumerate(index_shape) - ] - multiples_shape = [len(index_shape)] - tile_output_tensor_wrapper = self.define_custom_tensor_wrapper( - node_name=node.name + "_tile", + # Append one dimension to specify x-tuple + # Reshape the index_node for tile op + reshape_shape = list(indices_tensor.shape) + [1] + reshape_output_tensor = indices_tensor.reshape(reshape_shape) + reshape_output_tensor_wrapper = self.define_custom_tensor_wrapper( + node_name=node.name + f"_reshape_{i}", tensor_type=PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, - dtype=QNN_TENSOR_TYPE_MAP[tile_output_tensor.dtype], + dtype=QNN_TENSOR_TYPE_MAP[reshape_output_tensor.dtype], quant_encoding=PyQnnWrapper.Qnn_QuantizationEncoding_t.QNN_QUANTIZATION_ENCODING_UNDEFINED, quant_configs={}, - dims=tile_output_tensor.size(), - tensor=tile_output_tensor, + dims=reshape_output_tensor.size(), + tensor=reshape_output_tensor, is_fake_tensor=True, nodes_to_wrappers=nodes_to_wrappers, ) - tile_op = PyQnnWrapper.PyQnnOpWrapper( + reshape_op = PyQnnWrapper.PyQnnOpWrapper( node.name, QNN_OP_PACKAGE_NAME_QTI_AISW, - OpTile.op_name, + OpReshape.op_name, ) - tile_op.AddInputTensors([reshape_output_tensor_wrapper]) - tile_op.AddOutputTensors([tile_output_tensor_wrapper]) - tile_op.AddTensorParam( - OpTile.param_multiples, - PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, - len(multiples_shape), - multiples_shape, - np.array(multiples, dtype=np.uint32), - True, - ) - op_wrapper_list.append(tile_op) + reshape_op.AddInputTensors([indices_tensor_wrapper]) + reshape_op.AddOutputTensors([reshape_output_tensor_wrapper]) + op_wrapper_list.append(reshape_op) + index_tensor_wrapper = reshape_output_tensor_wrapper + index_tensor = reshape_output_tensor - # Repeat index for "None" axis in indicies_node - ranges = [ - torch.arange(dim, dtype=indice_tensor.dtype) - for dim in target_index[:-1] - ] - target_index_shape = target_index + [len(ranges)] - 
target_index_tensor = torch.cartesian_prod(*ranges) - reshape_target_index_shape = [ - shape if id != index_node_dim else 1 - for id, shape in enumerate(target_index_shape) - ] - target_index_tensor = target_index_tensor.reshape( - reshape_target_index_shape - ) - target_index_tensor = target_index_tensor.expand( - target_index_shape - ).contiguous() - target_index_node = torch.fx.Node( - node.graph, - node.name + "_target_index", - "call_function", - exir_ops.edge.aten.tensor.default, - (), # args - {}, # kwargs - ) - target_index_tensor_wrapper = self.define_tensor( - target_index_node, - node, - target_index_tensor, - PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC, - nodes_to_wrappers, - ) + # Tile the index_node and concat the target index + if None in indices_nodes: + tile_output_tensor = reshape_output_tensor.repeat( + all_range_tensor.size(0), 1 + ) + # Tile the index_node to align with the shape of target_index + # Only need to tile the dim of None axis + # E.g., indices_node: [None, None, aten__to_copy_default_1] + # Should tile the number of indices combination of first two dimension + # times number of indices specified by aten__to_copy_default_1 + multiples = [all_range_tensor.size(0), 1] + multiples_shape = [len(multiples)] + tile_output_tensor_wrapper = self.define_custom_tensor_wrapper( + node_name=node.name + f"_tile_{i}", + tensor_type=PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + dtype=QNN_TENSOR_TYPE_MAP[tile_output_tensor.dtype], + quant_encoding=PyQnnWrapper.Qnn_QuantizationEncoding_t.QNN_QUANTIZATION_ENCODING_UNDEFINED, + quant_configs={}, + dims=tile_output_tensor.size(), + tensor=tile_output_tensor, + is_fake_tensor=True, + nodes_to_wrappers=nodes_to_wrappers, + ) + tile_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpTile.op_name, + ) + tile_op.AddInputTensors([reshape_output_tensor_wrapper]) + tile_op.AddOutputTensors([tile_output_tensor_wrapper]) + tile_op.AddTensorParam( + 
OpTile.param_multiples, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, + len(multiples_shape), + multiples_shape, + np.array(multiples, dtype=np.uint32), + True, + ) + op_wrapper_list.append(tile_op) + index_tensor_wrapper = tile_output_tensor_wrapper + index_tensor = tile_output_tensor - # Concat target_index and tile output to reconstruct index_node - # Cannot use QNN Pack (stack) since QNN Pack is not support int32 dtype - concat_output_tensor = torch.concat( - (target_index_tensor, tile_output_tensor), dim=-1 + specified_index[i] = (index_tensor, index_tensor_wrapper) + + # Concat target_index and tile output to reconstruct index_node + # Cannot use QNN Pack (stack) since QNN Pack is not support int32 dtype + index_tensors, index_tensor_wrappers = [], [] + for i, arg in enumerate(indices_nodes): + tensor, tensor_wrapper = ( + all_range_index[i] if arg is None else specified_index[i] ) + index_tensors.append(tensor) + index_tensor_wrappers.append(tensor_wrapper) + + if len(index_tensor_wrappers) > 1: + concat_output_tensor = torch.concat(index_tensors, dim=-1) concat_output_tensor_wrapper = self.define_custom_tensor_wrapper( node_name=node.name + "_concat", tensor_type=PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, @@ -227,9 +313,7 @@ def define_node( QNN_OP_PACKAGE_NAME_QTI_AISW, OpConcat.op_name, ) - concat_op.AddInputTensors( - [target_index_tensor_wrapper, tile_output_tensor_wrapper] - ) + concat_op.AddInputTensors(index_tensor_wrappers) concat_op.AddOutputTensors([concat_output_tensor_wrapper]) concat_op.AddScalarParam( OpConcat.param_axis, @@ -237,7 +321,6 @@ def define_node( {QCOM_DATA: np.uint32(concat_output_tensor.dim() - 1)}, ) op_wrapper_list.append(concat_op) - index_put_index_input_tensor_wrapper = concat_output_tensor_wrapper value_node = self.get_node(node.args[2]) value_tensor = self.get_tensor(value_node, node) @@ -248,6 +331,94 @@ def define_node( PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, nodes_to_wrappers, ) + # handle 
broadcast scenario + # e.g. input_tensor: (1, 12, 1024, 64), value_tensor: (1, 64) + # => value_reshape_tensor: (1, 1, 1, 64) + new_value_shape = ( + *([1] * (input_tensor.dim() - value_tensor.dim())), + *value_tensor.shape, + ) + # reshape the value_node for tile op + value_quant_encoding, value_quant_configs = self.get_quant_encoding_conf( + value_node, node + ) + value_dtype = ( + QNN_TENSOR_TYPE_MAP[value_tensor.dtype] + if value_quant_encoding + == PyQnnWrapper.Qnn_QuantizationEncoding_t.QNN_QUANTIZATION_ENCODING_UNDEFINED + else QNN_QUANT_TYPE_MAP[ + ( + torch.uint16 + if value_quant_configs[QCOM_DTYPE] == torch.int32 + else value_quant_configs[QCOM_DTYPE] + ) + ] + ) + value_reshape_tensor = value_tensor.reshape(new_value_shape) + value_reshape_tensor_wrapper = self.define_custom_tensor_wrapper( + node_name=node.name + "_value_reshape", + tensor_type=PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + dtype=value_dtype, + quant_encoding=value_quant_encoding, + quant_configs=value_quant_configs, + dims=value_reshape_tensor.size(), + tensor=value_reshape_tensor, + is_fake_tensor=True, + nodes_to_wrappers=nodes_to_wrappers, + ) + value_reshape_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpReshape.op_name, + ) + value_reshape_op.AddInputTensors([value_tensor_wrapper]) + value_reshape_op.AddOutputTensors([value_reshape_tensor_wrapper]) + op_wrapper_list.append(value_reshape_op) + + # e.g. 
input_tensor: (1, 12, 1024, 64), index_tensor: (None, None, 2), value_tensor: (1, 64) + # => multiples: [1, 12, 2, 1] + value_multiples = [] + for i in range(input_tensor.dim() - 1, -1, -1): + if i in specified_index: + # all user specified index node wil have the same dimension + multiplier = ( + indices_nodes[i].meta["val"].nelement() // new_value_shape[i] + if i == last_specified_index_node + else 1 + ) + else: + multiplier = input_tensor.shape[i] // new_value_shape[i] + value_multiples.insert(0, multiplier) + + value_tile_tensor = value_reshape_tensor.repeat(value_multiples) + value_multiples_shape = [len(value_multiples)] + value_tile_tensor_wrapper = self.define_custom_tensor_wrapper( + node_name=node.name + "_value_tile", + tensor_type=PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + dtype=value_dtype, + quant_encoding=value_quant_encoding, + quant_configs=value_quant_configs, + dims=value_tile_tensor.size(), + tensor=value_tile_tensor, + is_fake_tensor=True, + nodes_to_wrappers=nodes_to_wrappers, + ) + value_tile_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpTile.op_name, + ) + value_tile_op.AddInputTensors([value_reshape_tensor_wrapper]) + value_tile_op.AddOutputTensors([value_tile_tensor_wrapper]) + value_tile_op.AddTensorParam( + OpTile.param_multiples, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, + len(value_multiples_shape), + value_multiples_shape, + np.array(value_multiples, dtype=np.uint32), + True, + ) + op_wrapper_list.append(value_tile_op) output_tensor = self.get_tensor(node, node) output_tensor_wrapper = self.define_tensor( @@ -263,11 +434,46 @@ def define_node( QNN_OP_PACKAGE_NAME_QTI_AISW, OpScatterNd.op_name, ) + # accumulation + if len(node.args) > 3 and node.args[3]: + index_put_op.AddScalarParam( + OpScatterNd.param_reduction, + PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32, + {QCOM_DATA: 1}, + ) + + # check final index_input tensor + index_input_tensor, index_input_tensor_wrapper = ( 
+ (concat_output_tensor, concat_output_tensor_wrapper) + if len(index_tensor_wrappers) > 1 + else specified_index[last_specified_index_node] + ) + target_index_reshape_tensor = index_input_tensor.reshape((*target_index, -1)) + target_index_reshape_tensor_wrapper = self.define_custom_tensor_wrapper( + node_name=node.name + "_target_index_reshape", + tensor_type=PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE, + dtype=QNN_TENSOR_TYPE_MAP[target_index_reshape_tensor.dtype], + quant_encoding=PyQnnWrapper.Qnn_QuantizationEncoding_t.QNN_QUANTIZATION_ENCODING_UNDEFINED, + quant_configs={}, + dims=target_index_reshape_tensor.size(), + tensor=target_index_reshape_tensor, + is_fake_tensor=True, + nodes_to_wrappers=nodes_to_wrappers, + ) + target_index_reshape_op = PyQnnWrapper.PyQnnOpWrapper( + node.name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + OpReshape.op_name, + ) + target_index_reshape_op.AddInputTensors([index_input_tensor_wrapper]) + target_index_reshape_op.AddOutputTensors([target_index_reshape_tensor_wrapper]) + op_wrapper_list.append(target_index_reshape_op) + index_put_op.AddInputTensors( [ input_tensor_wrapper, - index_put_index_input_tensor_wrapper, - value_tensor_wrapper, + target_index_reshape_tensor_wrapper, + value_tile_tensor_wrapper, ] ) index_put_op.AddOutputTensors([output_tensor_wrapper]) diff --git a/backends/qualcomm/builders/op_mean_dim.py b/backends/qualcomm/builders/op_mean_dim.py index 630b1b0b8de..10644e17c79 100644 --- a/backends/qualcomm/builders/op_mean_dim.py +++ b/backends/qualcomm/builders/op_mean_dim.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-from typing import cast, Dict, List +from typing import cast, Dict import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper @@ -40,7 +40,22 @@ def define_node( ) # mean dims and keep dims - mean_dims = cast(List[int], node.args[1]) + rank = len(input_node.meta["val"].shape) + + if rank == 0: + raise RuntimeError( + "Mean doesn't support 0d input, please report a bug in https://github.com/pytorch/executorch/issues" + ) + + dim_arg = node.args[1] + + if dim_arg is None or len(dim_arg) == 0: + mean_dims = list(range(rank)) # reduce over all dims + elif isinstance(dim_arg, int): + mean_dims = [dim_arg] + else: + mean_dims = list(dim_arg) + mean_dims = [ mean_dim % len(input_node.meta["val"].shape) for mean_dim in mean_dims ] diff --git a/backends/qualcomm/builders/op_transpose.py b/backends/qualcomm/builders/op_transpose.py index dbed10ced46..e7fd84e8e79 100644 --- a/backends/qualcomm/builders/op_transpose.py +++ b/backends/qualcomm/builders/op_transpose.py @@ -42,6 +42,8 @@ def define_node( # permutation permute_order = cast(List[int], node.args[1]) + # to prevent negative values + permute_order = [x % len(permute_order) for x in permute_order] permute_order_shape = [len(permute_order)] output_tensor = input_tensor.permute(permute_order) diff --git a/backends/qualcomm/builders/qnn_constants.py b/backends/qualcomm/builders/qnn_constants.py index b0c44dcae80..79a1c93d50c 100644 --- a/backends/qualcomm/builders/qnn_constants.py +++ b/backends/qualcomm/builders/qnn_constants.py @@ -59,6 +59,15 @@ class OpConv2d: param_dilation: str = "dilation" +@dataclass(init=False, frozen=True) +class OpConv3d: + op_name: str = "Conv3d" + param_stride: str = "stride" + param_pad_amount: str = "pad_amount" + param_group: str = "group" + param_dilation: str = "dilation" + + @dataclass(init=False, frozen=True) class OpConvert: op_name: str = "Convert" @@ -573,6 +582,15 @@ class OpTransposeConv2d: param_output_padding: str = "output_padding" +@dataclass(init=False, 
frozen=True) +class OpTransposeConv3d: + op_name: str = "TransposeConv3d" + param_stride: str = "stride" + param_pad_amount: str = "pad_amount" + param_group: str = "group" + param_output_padding: str = "output_padding" + + @dataclass(init=False, frozen=True) class OpUnpack: op_name: str = "UnPack" diff --git a/backends/qualcomm/partition/common_defs.py b/backends/qualcomm/partition/common_defs.py index 7a2924fe756..0a947759538 100644 --- a/backends/qualcomm/partition/common_defs.py +++ b/backends/qualcomm/partition/common_defs.py @@ -17,6 +17,7 @@ to_be_implemented_operator = [ exir_ops.edge.aten._adaptive_avg_pool3d.default, exir_ops.edge.aten.adaptive_max_pool2d.default, + exir_ops.edge.aten.adaptive_max_pool3d.default, exir_ops.edge.aten.avg_pool3d.default, exir_ops.edge.aten.div.Tensor_mode, exir_ops.edge.aten.log10.default, diff --git a/backends/qualcomm/quantizer/annotators.py b/backends/qualcomm/quantizer/annotators.py index 88109b51697..cf403a1a76d 100644 --- a/backends/qualcomm/quantizer/annotators.py +++ b/backends/qualcomm/quantizer/annotators.py @@ -68,7 +68,7 @@ def _is_float_tensor(node: Node): or not isinstance(node.meta["val"], FakeTensor) ): return False - return node.meta["val"].dtype == torch.float32 + return node.meta["val"].dtype in (torch.bfloat16, torch.float32) def _mark_nodes_as_annotated(nodes: List[Node]): @@ -674,7 +674,7 @@ def annotate_pad(node: Node, quantization_config: QuantizationConfig) -> None: annotate_single_in_single_out(node, quantization_config) -@register_annotator([torch.ops.aten.reshape.default]) +@register_annotator([torch.ops.aten.reshape.default, torch.ops.aten.unflatten.int]) def annotate_reshape(node: Node, quantization_config: QuantizationConfig) -> None: annotate_single_in_single_out(node, quantization_config) @@ -879,7 +879,7 @@ def annotate_unsqueeze_copy( annotate_single_in_share_out(node, quantization_config) -@register_annotator([torch.ops.aten.transpose.int]) 
+@register_annotator([torch.ops.aten.transpose.int, torch.ops.aten.swapaxes.default]) def annotate_transpose(node: Node, quantization_config: QuantizationConfig) -> None: annotate_in_out_obs_sharing_op(node, quantization_config) if not _is_annotated([node]): @@ -1094,11 +1094,13 @@ def annotate_cdist(node: Node, quantization_config: QuantizationConfig) -> None: @register_annotator( [ + torch.ops.aten.conv1d.default, torch.ops.aten.conv2d.default, torch.ops.aten.conv2d.padding, - torch.ops.aten.conv1d.default, - torch.ops.aten.conv_transpose2d.input, + torch.ops.aten.conv3d.default, torch.ops.aten.conv_transpose1d.default, + torch.ops.aten.conv_transpose2d.input, + torch.ops.aten.conv_transpose3d.input, torch.ops.aten.convolution.default, ] ) @@ -1356,7 +1358,7 @@ def annotate_chunk(node: Node, quantization_config: QuantizationConfig) -> None: ) -@register_annotator([torch.ops.aten.where.self]) +@register_annotator([torch.ops.aten.where.self, torch.ops.aten.where.ScalarSelf]) def annotate_where(node: Node, quantization_config: QuantizationConfig) -> None: if _is_annotated([node]): return @@ -1366,7 +1368,6 @@ def annotate_where(node: Node, quantization_config: QuantizationConfig) -> None: assert isinstance(input_node, Node) if _is_float_tensor(input_node): input_qspec_map[input_node] = quantization_config.input_activation - node.meta[Q_ANNOTATION_KEY] = QuantizationAnnotation( input_qspec_map=input_qspec_map, output_qspec=( diff --git a/backends/qualcomm/quantizer/qconfig.py b/backends/qualcomm/quantizer/qconfig.py index 2f26cd27d31..694fab3dc6b 100644 --- a/backends/qualcomm/quantizer/qconfig.py +++ b/backends/qualcomm/quantizer/qconfig.py @@ -200,12 +200,11 @@ def get_16a8w_qnn_qat_config( act_observer=MovingAverageMinMaxObserver, ) -> QuantizationConfig: extra_args: Dict[str, Any] = {"eps": 2**-20} - act_fake_quant_ctr = FakeQuantize.with_args( + act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args( dtype=torch.int32, 
quant_min=torch.iinfo(torch.uint16).min, quant_max=torch.iinfo(torch.uint16).max, qscheme=torch.per_tensor_affine, - reduce_range=True, observer=act_observer.with_args(**extra_args), ) act_quantization_spec = QuantizationSpec( @@ -220,7 +219,6 @@ def get_16a8w_qnn_qat_config( quant_min=torch.iinfo(torch.int8).min + 1, quant_max=torch.iinfo(torch.int8).max, qscheme=torch.per_tensor_symmetric, - reduce_range=True, observer=MovingAverageMinMaxObserver, ) weight_quantization_spec = QuantizationSpec( @@ -400,7 +398,7 @@ def get_ptq_per_block_quant_config( def get_8a8w_qnn_qat_config( act_symmetric: bool = False, act_observer=MovingAverageMinMaxObserver ) -> QuantizationConfig: - act_fake_quant_ctr = FakeQuantize.with_args( + act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args( dtype=torch.uint8, qscheme=( torch.per_tensor_symmetric if act_symmetric else torch.per_tensor_affine @@ -421,7 +419,6 @@ def get_8a8w_qnn_qat_config( quant_min=torch.iinfo(torch.int8).min + 1, quant_max=torch.iinfo(torch.int8).max, qscheme=torch.per_tensor_symmetric, - reduce_range=True, observer=MovingAverageMinMaxObserver, ) weight_quantization_spec = QuantizationSpec( @@ -438,7 +435,6 @@ def get_8a8w_qnn_qat_config( quant_min=torch.iinfo(torch.int32).min, quant_max=torch.iinfo(torch.int32).max, qscheme=torch.per_tensor_symmetric, - reduce_range=True, observer=MovingAverageMinMaxObserver, ) bias_quantization_spec = QuantizationSpec( @@ -462,12 +458,11 @@ def get_8a8w_qnn_qat_config( def get_16a4w_qnn_qat_config( act_observer=MovingAverageMinMaxObserver, ) -> QuantizationConfig: - act_fake_quant_ctr = FakeQuantize.with_args( + act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args( dtype=torch.int32, quant_min=torch.iinfo(torch.uint16).min, quant_max=torch.iinfo(torch.uint16).max, qscheme=torch.per_tensor_affine, - reduce_range=True, observer=act_observer, ) act_quantization_spec = QuantizationSpec( @@ -484,7 +479,6 @@ def get_16a4w_qnn_qat_config( quant_max=7, 
qscheme=torch.per_tensor_symmetric, ch_axis=0, - reduce_range=True, observer=MovingAverageMinMaxObserver, ) weight_quantization_spec = QuantizationSpec( @@ -501,7 +495,6 @@ def get_16a4w_qnn_qat_config( quant_min=torch.iinfo(torch.int32).min, quant_max=torch.iinfo(torch.int32).max, qscheme=torch.per_tensor_symmetric, - reduce_range=True, observer=MovingAverageMinMaxObserver, ) bias_quantization_spec = QuantizationSpec( @@ -548,10 +541,9 @@ def get_qat_per_channel_quant_config( # If zero_point is 128, htp can do optimizations. # If we keep quant_min and quant_max none, observer will default use 128 as zero_point. # If we provide uint8 quant_min/max, it will use 127 as zero_point, which is undesired. - act_fake_quant_ctr = FakeQuantize.with_args( + act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args( dtype=torch.int32 if act_dtype == torch.uint16 else act_dtype, qscheme=torch.per_tensor_symmetric, - reduce_range=True, observer=act_observer, ) act_quantization_spec = QuantizationSpec( @@ -561,12 +553,11 @@ def get_qat_per_channel_quant_config( observer_or_fake_quant_ctr=act_fake_quant_ctr, ) else: - act_fake_quant_ctr = FakeQuantize.with_args( + act_fake_quant_ctr = FusedMovingAvgObsFakeQuantize.with_args( dtype=torch.int32 if act_dtype == torch.uint16 else act_dtype, quant_min=torch.iinfo(act_dtype).min, quant_max=torch.iinfo(act_dtype).max, qscheme=torch.per_tensor_affine, - reduce_range=True, observer=act_observer, ) act_quantization_spec = QuantizationSpec( diff --git a/backends/qualcomm/quantizer/quantizer.py b/backends/qualcomm/quantizer/quantizer.py index 5943b54d968..44d129d5544 100644 --- a/backends/qualcomm/quantizer/quantizer.py +++ b/backends/qualcomm/quantizer/quantizer.py @@ -161,6 +161,7 @@ def __post_init__(self): { torch.ops.aten.conv1d.default, torch.ops.aten.conv2d.default, + torch.ops.aten.conv3d.default, torch.ops.aten.conv_transpose2d.input, } ) diff --git a/backends/qualcomm/scripts/build.sh b/backends/qualcomm/scripts/build.sh index 
c84911cf851..4cdd1efe6f4 100755 --- a/backends/qualcomm/scripts/build.sh +++ b/backends/qualcomm/scripts/build.sh @@ -86,6 +86,7 @@ if [ "$BUILD_AARCH64" = true ]; then -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ -DEXECUTORCH_ENABLE_LOGGING=ON \ @@ -155,6 +156,7 @@ if [ "$BUILD_X86_64" = true ]; then -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ diff --git a/backends/qualcomm/scripts/download_qnn_sdk.py b/backends/qualcomm/scripts/download_qnn_sdk.py index 35006a41433..747524a0e5b 100644 --- a/backends/qualcomm/scripts/download_qnn_sdk.py +++ b/backends/qualcomm/scripts/download_qnn_sdk.py @@ -6,12 +6,15 @@ import platform import re import shutil +import subprocess +import sys import tarfile import tempfile import urllib.request import zipfile from typing import Dict, List, Optional, Tuple + logger = logging.getLogger(__name__) logger.addHandler(logging.NullHandler()) @@ -34,68 +37,81 @@ def is_linux_x86() -> bool: ) -import subprocess +######################### +# Cache directory helper +######################### -MINIMUM_LIBC_VERSION = 2.29 +APP_NAMESPACE = ["executorch", "qnn"] -REQUIRED_LIBC_LIBS = [ - "/lib/x86_64-linux-gnu/libc.so.6", - "/lib64/libc.so.6", - "/lib/libc.so.6", -] +def _get_staging_dir(*parts: str) -> pathlib.Path: + r""" + Return a cross-platform staging directory for staging SDKs/libraries. 
+ + - On Linux: + ~/.cache/executorch/qnn/ + (falls back to $HOME/.cache if $XDG_CACHE_HOME is unset) -def check_glibc_exist_and_validate() -> bool: + - On Windows (not supported yet, but as placeholder): + %LOCALAPPDATA%\executorch\qnn\ + (falls back to $HOME/AppData/Local if %LOCALAPPDATA% is unset) + + - Override: + If QNN_STAGING_DIR is set in the environment, that path is used instead. + + Args: + parts (str): Subdirectories to append under the root staging dir. + + Returns: + pathlib.Path: Fully qualified staging path. """ - Check if users have glibc installed. + # Environment override wins + base = os.environ.get("QNN_STAGING_DIR") + if base: + return pathlib.Path(base).joinpath(*parts) + + system = platform.system().lower() + if system == "windows": + # On Windows, prefer %LOCALAPPDATA%, fallback to ~/AppData/Local + base = pathlib.Path( + os.environ.get("LOCALAPPDATA", pathlib.Path.home() / "AppData" / "Local") + ) + elif is_linux_x86(): + # On Linux/Unix, prefer $XDG_CACHE_HOME, fallback to ~/.cache + base = pathlib.Path( + os.environ.get("XDG_CACHE_HOME", pathlib.Path.home() / ".cache") + ) + else: + raise ValueError(f"Unsupported platform: {system}") + + return base.joinpath(*APP_NAMESPACE, *parts) + + +def _atomic_download(url: str, dest: pathlib.Path): """ - exists = False - for path in REQUIRED_LIBC_LIBS: - try: - output = subprocess.check_output( - [path, "--version"], stderr=subprocess.STDOUT - ) - output = output.decode().split("\n")[0] - logger.debug(f"[QNN] glibc version for path {path} is: {output}") - match = re.search(r"version (\d+\.\d+)", output) - if match: - version = match.group(1) - if float(version) >= MINIMUM_LIBC_VERSION: - logger.debug(f"[QNN] glibc version is {version}.") - exists = True - return True - else: - logger.error( - f"[QNN] glibc version is too low. The minimum libc version is {MINIMUM_LIBC_VERSION} Please install glibc following the commands below." 
- ) - else: - logger.error("[QNN] glibc version not found.") + Download URL into dest atomically: + - Write to a temp file in the same dir + - Move into place if successful + """ + dest.parent.mkdir(parents=True, exist_ok=True) - except Exception: - continue + # Temp file in same dir (guarantees atomic rename) + with tempfile.NamedTemporaryFile(dir=dest.parent, delete=False) as tmp: + tmp_path = pathlib.Path(tmp.name) - if not exists: - logger.error( - r"""" - [QNN] glibc not found or the version is too low. Please install glibc following the commands below. - Ubuntu/Debian: - sudo apt update - sudo apt install libc6 - - Fedora/Red Hat: - sudo dnf install glibc - - Arch Linux: - sudo pacman -S glibc - - Also please make sure the glibc version is >= MINIMUM_LIBC_VERSION. You can verify the glibc version by running the following command: - Option 1: - ldd --version - Option 2: - /path/to/libc.so.6 --version - """ - ) - return exists + try: + urllib.request.urlretrieve(url, tmp_path) + tmp_path.replace(dest) # atomic rename + except Exception: + # Clean up partial file on failure + if tmp_path.exists(): + tmp_path.unlink(missing_ok=True) + raise + + +#################### +# qnn sdk download management +#################### def _download_archive(url: str, archive_path: pathlib.Path) -> bool: @@ -178,9 +194,6 @@ def _download_qnn_sdk(dst_folder=SDK_DIR) -> Optional[pathlib.Path]: if not is_linux_x86(): logger.info("[QNN] Skipping Qualcomm SDK (only supported on Linux x86).") return None - elif not check_glibc_exist_and_validate(): - logger.info("[QNN] Skipping Qualcomm SDK (glibc not found or version too old).") - return None else: logger.info("[QNN] Downloading Qualcomm SDK for Linux x86") @@ -241,6 +254,136 @@ def _extract_tar(archive_path: pathlib.Path, prefix: str, target_dir: pathlib.Pa dst.write(src.read()) +#################### +# libc management +#################### + +GLIBC_VERSION = "2.34" +GLIBC_REEXEC_GUARD = "QNN_GLIBC_REEXEC" +MINIMUM_LIBC_VERSION = 
GLIBC_VERSION + + +def _get_glibc_libdir() -> pathlib.Path: + glibc_root = _get_staging_dir(f"glibc-{GLIBC_VERSION}") + return glibc_root / "lib" + + +def _parse_version(v: str) -> tuple[int, int]: + """Turn '2.34' → (2,34) so it can be compared.""" + parts = v.split(".") + return int(parts[0]), int(parts[1]) if len(parts) > 1 else 0 + + +def _current_glibc_version() -> str: + """Return system glibc version string (via ctypes).""" + try: + libc = ctypes.CDLL("libc.so.6") + func = libc.gnu_get_libc_version + func.restype = ctypes.c_char_p + return func().decode() + except Exception as e: + return f"error:{e}" + + +def _resolve_glibc_loader() -> pathlib.Path | None: + """Return staged ld.so path if available.""" + for p in [ + _get_glibc_libdir() / f"ld-{GLIBC_VERSION}.so", + _get_glibc_libdir() / "ld-linux-x86-64.so.2", + ]: + if p.exists(): + return p + return None + + +def _stage_prebuilt_glibc(): + """Download + extract Fedora 35 glibc RPM into /tmp.""" + logger.info(">>> Staging prebuilt glibc-%s from Fedora 35 RPM", GLIBC_VERSION) + _get_glibc_libdir().mkdir(parents=True, exist_ok=True) + rpm_path = _get_staging_dir("glibc") / "glibc.rpm" + work_dir = _get_staging_dir("glibc") / "extracted" + rpm_url = ( + "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/35/" + "Everything/x86_64/os/Packages/g/glibc-2.34-7.fc35.x86_64.rpm" + ) + + rpm_path.parent.mkdir(parents=True, exist_ok=True) + logger.info("[glibc] Downloading %s -> %s", rpm_url, rpm_path) + try: + urllib.request.urlretrieve(rpm_url, rpm_path) + except Exception as e: + logger.error("[glibc] Failed to download %s: %s", rpm_url, e) + raise + + # Extract + if work_dir.exists(): + shutil.rmtree(work_dir) + work_dir.mkdir(parents=True) + subprocess.check_call(["bsdtar", "-C", str(work_dir), "-xf", str(rpm_path)]) + + # Copy runtime libs + staged = [ + "ld-linux-x86-64.so.2", + "libc.so.6", + "libdl.so.2", + "libpthread.so.0", + "librt.so.1", + "libm.so.6", + "libutil.so.1", + ] + for lib 
in staged: + src = work_dir / "lib64" / lib + if src.exists(): + shutil.copy2(src, _get_glibc_libdir() / lib) + logger.info("[glibc] Staged %s", lib) + else: + logger.warning("[glibc] Missing %s in RPM", lib) + + +def ensure_glibc_minimum(min_version: str = GLIBC_VERSION): + """ + Ensure process runs under glibc >= min_version. + - If system glibc is new enough → skip. + - Else → stage Fedora RPM and re-exec under staged loader. + """ + current = _current_glibc_version() + logger.info("[glibc] Current loaded glibc: %s", current) + + # If system glibc already sufficient → skip everything + m = re.match(r"(\d+\.\d+)", current) + if m and _parse_version(m.group(1)) >= _parse_version(min_version): + logger.info("[glibc] System glibc >= %s, no staging needed.", min_version) + return + + # Avoid infinite loop + if os.environ.get(GLIBC_REEXEC_GUARD) == "1": + logger.info("[glibc] Already re-exec'd once, continuing.") + return + + # Stage prebuilt if not already staged + if not (_get_glibc_libdir() / "libc.so.6").exists(): + _stage_prebuilt_glibc() + + loader = _resolve_glibc_loader() + if not loader: + logger.error("[glibc] Loader not found in %s", _get_glibc_libdir()) + return + + logger.info( + "[glibc] Re-execing under loader %s with libdir %s", loader, _get_glibc_libdir() + ) + os.environ[GLIBC_REEXEC_GUARD] = "1" + os.execv( + str(loader), + [str(loader), "--library-path", str(_get_glibc_libdir()), sys.executable] + + sys.argv, + ) + + +#################### +# libc++ management +#################### + LLVM_VERSION = "14.0.0" LIBCXX_BASE_NAME = f"clang+llvm-{LLVM_VERSION}-x86_64-linux-gnu-ubuntu-18.04" LLVM_URL = f"https://github.com/llvm/llvm-project/releases/download/llvmorg-{LLVM_VERSION}/{LIBCXX_BASE_NAME}.tar.xz" @@ -258,12 +401,17 @@ def _stage_libcxx(target_dir: pathlib.Path): logger.info("[libcxx] Already staged at %s, skipping download", target_dir) return - temp_tar = pathlib.Path("/tmp") / f"{LIBCXX_BASE_NAME}.tar.xz" - temp_extract = pathlib.Path("/tmp") / 
LIBCXX_BASE_NAME + libcxx_stage = _get_staging_dir(f"libcxx-{LLVM_VERSION}") + temp_tar = libcxx_stage / f"{LIBCXX_BASE_NAME}.tar.xz" + temp_extract = libcxx_stage / LIBCXX_BASE_NAME if not temp_tar.exists(): logger.info("[libcxx] Downloading %s", LLVM_URL) - urllib.request.urlretrieve(LLVM_URL, temp_tar) + _atomic_download(LLVM_URL, temp_tar) + + # Sanity check before extracting + if not temp_tar.exists() or temp_tar.stat().st_size == 0: + raise FileNotFoundError(f"[libcxx] Tarball missing or empty: {temp_tar}") logger.info("[libcxx] Extracting %s", temp_tar) with tarfile.open(temp_tar, "r:xz") as tar: @@ -437,8 +585,10 @@ def install_qnn_sdk() -> bool: Returns: True if both steps succeeded (or were already satisfied), else False. """ - if check_glibc_exist_and_validate(): - if _ensure_libcxx_stack(): - if _ensure_qnn_sdk_lib(): - return True - return False + logger.info("[QNN] Starting SDK installation") + + # Make sure we’re running under >= 2.34 + ensure_glibc_minimum(GLIBC_VERSION) + + # libc++ and QNN SDK setup + return _ensure_libcxx_stack() and _ensure_qnn_sdk_lib() diff --git a/backends/qualcomm/tests/TARGETS b/backends/qualcomm/tests/TARGETS index 639303c7eb8..d968f954485 100644 --- a/backends/qualcomm/tests/TARGETS +++ b/backends/qualcomm/tests/TARGETS @@ -47,3 +47,17 @@ runtime.python_library( ":test_qnn_delegate" ] ) + +runtime.python_test( + name = "test_passes", + srcs = [ + "test_passes.py", + ], + deps = [ + "fbsource//third-party/pypi/expecttest:expecttest", # @manual + "//caffe2:torch", + "//executorch/exir:lib", + "//executorch/backends/qualcomm/_passes:passes", + "//executorch/backends/qualcomm/builders:builders", + ], +) diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py index 77ff1be4562..5ea6caf54ad 100644 --- a/backends/qualcomm/tests/models.py +++ b/backends/qualcomm/tests/models.py @@ -4,8 +4,9 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of 
this source tree. -import torch +from typing import List, Optional, Tuple, Union +import torch # module with related operator only @@ -66,6 +67,28 @@ def forward(self, x, y): return torch.add(x, y) +class AddAlpha(torch.nn.Module): + def __init__(self, alpha): + super().__init__() + self.alpha = alpha + + def forward(self, x, y): + return torch.add(x, y, alpha=self.alpha) + + +class AddAlphaConstant(torch.nn.Module): + def __init__(self, alpha, constant_first=False): + super().__init__() + self.alpha = alpha + self.constant_first = constant_first + + def forward(self, x): + if self.constant_first: + return torch.add(5.0, x, alpha=self.alpha) + else: + return torch.add(x, 5.0, alpha=self.alpha) + + class AddConstantFloat(torch.nn.Module): def __init__(self): super().__init__() @@ -148,21 +171,23 @@ def forward(self, y): class Argmax(torch.nn.Module): - def __init__(self): + def __init__(self, dim: Optional[int] = None, keepdim: bool = False): super().__init__() + self.dim = dim + self.keepdim = keepdim def forward(self, x): - x = torch.argmax(x, dim=0, keepdim=True) - return x + return torch.argmax(x, dim=self.dim, keepdim=self.keepdim) class Argmin(torch.nn.Module): - def __init__(self): + def __init__(self, dim: Optional[int] = None, keepdim: bool = False): super().__init__() + self.dim = dim + self.keepdim = keepdim def forward(self, x): - x = torch.argmin(x, dim=0, keepdim=True) - return x + return torch.argmin(x, dim=self.dim, keepdim=self.keepdim) class ArgminViewSqueezeConv2D(torch.nn.Module): @@ -274,6 +299,15 @@ def forward(self, x, y): return torch.cat((y, y, x, x), axis=2) +class Cat5(torch.nn.Module): + def __init__(self): + super().__init__() + self.const_tensor = torch.randn(1, 1, 2, 2) + + def forward(self, x, y): + return torch.cat((x, y, self.const_tensor), axis=2) + + class CausalMask(torch.nn.Module): def __init__(self): super().__init__() @@ -588,40 +622,6 @@ def forward(self, x): return self.conv(x) -class ConvTranspose1dSingle(torch.nn.Module): 
- def __init__(self, bias=True, dilation=1): - super().__init__() - self.conv_transpose = torch.nn.ConvTranspose1d( - in_channels=1, - out_channels=3, - kernel_size=3, - stride=2, - padding=1, - dilation=dilation, - bias=bias, - ) - - def forward(self, x): - return self.conv_transpose(x) - - -class ConvTranspose2dSingle(torch.nn.Module): - def __init__(self, bias=True, dilation=1): - super().__init__() - self.conv_transpose = torch.nn.ConvTranspose2d( - in_channels=1, - out_channels=3, - kernel_size=3, - stride=2, - padding=1, - dilation=dilation, - bias=bias, - ) - - def forward(self, x): - return self.conv_transpose(x) - - class Conv2dDownUpSample(torch.nn.Module): def __init__(self, bias=True): super().__init__() @@ -706,6 +706,79 @@ def forward(self, x): return topk_values +class Conv3dSequential(torch.nn.Module): + def __init__(self, bias=True): + super().__init__() + self.first = torch.nn.Conv3d( + in_channels=1, + out_channels=3, + kernel_size=(3, 3, 3), + padding=1, + bias=bias, + ) + self.second = torch.nn.Conv3d( + in_channels=3, + out_channels=2, + kernel_size=(3, 3, 3), + padding=1, + bias=bias, + ) + + def forward(self, x): + return self.second(self.first(x)) + + +class ConvTranspose1dSingle(torch.nn.Module): + def __init__(self, bias=True, dilation=1): + super().__init__() + self.conv_transpose = torch.nn.ConvTranspose1d( + in_channels=1, + out_channels=3, + kernel_size=3, + stride=2, + padding=1, + dilation=dilation, + bias=bias, + ) + + def forward(self, x): + return self.conv_transpose(x) + + +class ConvTranspose2dSingle(torch.nn.Module): + def __init__(self, bias=True, dilation=1): + super().__init__() + self.conv_transpose = torch.nn.ConvTranspose2d( + in_channels=1, + out_channels=3, + kernel_size=3, + stride=2, + padding=1, + dilation=dilation, + bias=bias, + ) + + def forward(self, x): + return self.conv_transpose(x) + + +class ConvTranspose3dSingle(torch.nn.Module): + def __init__(self, bias=True, dilation=1): + super().__init__() + 
self.conv_transpose = torch.nn.ConvTranspose3d( + in_channels=1, + out_channels=3, + kernel_size=3, + stride=2, + padding=1, + dilation=dilation, + bias=bias, + ) + + def forward(self, x): + return self.conv_transpose(x) + + class Cos(torch.nn.Module): def __init__(self): super().__init__() @@ -1068,20 +1141,62 @@ def forward(self, input_pos, k_val): class IndexPut(torch.nn.Module): - def __init__(self, skip_mutable_buffer=False): + def __init__(self, skip_mutable_buffer=False, mode=0): super().__init__() self.skip_mutable_buffer = skip_mutable_buffer self.register_buffer( "k_cache", - torch.zeros((1, 1024, 12, 64), dtype=torch.float32), + torch.zeros((2, 1024, 12, 64), dtype=torch.float32), persistent=True, ) + self.mode = mode def forward(self, input_pos, k_val): - k_out = torch.ops.aten.index_put_(self.k_cache, [None, input_pos], k_val) + match self.mode: + case 0: + k_out = torch.ops.aten.index_put_(self.k_cache, [input_pos], k_val) + case 1: + k_out = torch.ops.aten.index_put_( + self.k_cache, [None, input_pos], k_val + ) + case 2: + k_out = torch.ops.aten.index_put_( + self.k_cache, [None, None, input_pos], k_val + ) + case 3: + k_out = torch.ops.aten.index_put_( + self.k_cache, [input_pos[0], input_pos[1]], k_val + ) + case 4: + k_out = torch.ops.aten.index_put_( + self.k_cache, [None, input_pos[0], input_pos[1]], k_val + ) + case 5: + k_out = torch.ops.aten.index_put_( + self.k_cache, [input_pos[0], None, input_pos[1]], k_val + ) + return k_out + 0 +class IndexPutSuite(torch.nn.Module): + def __init__(self, accumulate=False, in_place=False): + super().__init__() + self.accumulate = accumulate + self.in_place = in_place + + def forward(self, x, indices, values): + if self.in_place: + # Clone the input to avoid modifying it in-place + result = x.clone() + # Apply index_put_ and return the modified tensor + result.index_put_(indices, values, self.accumulate) + return result + else: + # Use the non-in-place variant which returns a new tensor + return 
torch.index_put(x, indices, values, self.accumulate) + + class IndexSelect(torch.nn.Module): def __init__(self, dim): super().__init__() @@ -1262,20 +1377,20 @@ def forward(self, x): return self.max_pool2d(x) -class MeanWKeppDim(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.mean(x, (-1, -2), keepdim=True) - - -class MeanWOKeppDim(torch.nn.Module): - def __init__(self): +class Mean(torch.nn.Module): + def __init__( + self, + dim: Optional[Union[int, Tuple[int, ...], List[int]]] = None, + keepdim: bool = False, + dtype: Optional[torch.dtype] = None, + ): super().__init__() + self.dim = dim + self.keepdim = keepdim + self.dtype = dtype def forward(self, x): - return torch.mean(x, (-1, -2)) + return torch.mean(x, dim=self.dim, keepdim=self.keepdim, dtype=self.dtype) class MaskedFill(torch.nn.Module): @@ -1436,6 +1551,15 @@ def forward(self, x): ) +class Permute(torch.nn.Module): + def __init__(self, dims: List[int]): + super().__init__() + self.dims = dims + + def forward(self, x): + return x.permute(self.dims) + + class PixelShuffle(torch.nn.Module): def __init__(self, scale): super().__init__() @@ -1469,11 +1593,12 @@ def forward(self, x): class PowTensorScalar(torch.nn.Module): - def __init__(self): + def __init__(self, exponent=2): super().__init__() + self.exponent = exponent def forward(self, x): - return torch.pow(x, 2) + return torch.pow(x, self.exponent) class PReLUDefault(torch.nn.Module): @@ -1854,6 +1979,28 @@ def forward(self, x, y): return torch.sub(x, y) +class SubAlpha(torch.nn.Module): + def __init__(self, alpha): + super().__init__() + self.alpha = alpha + + def forward(self, x, y): + return torch.sub(x, y, alpha=self.alpha) + + +class SubAlphaConstant(torch.nn.Module): + def __init__(self, alpha, constant_first=False): + super().__init__() + self.alpha = alpha + self.constant_first = constant_first + + def forward(self, x): + if self.constant_first: + return torch.sub(5.0, x, alpha=self.alpha) + 
else: + return torch.sub(x, 5.0, alpha=self.alpha) + + class SubConstantFloat(torch.nn.Module): def __init__(self): super().__init__() @@ -1890,6 +2037,16 @@ def forward(self, x): return torch.sum(x, dim=(2, 3), keepdim=True) +class SwapAxes(torch.nn.Module): + def __init__(self, axis0, axis1): + super().__init__() + self.axis0 = axis0 + self.axis1 = axis1 + + def forward(self, x): + return torch.swapaxes(x, axis0=self.axis0, axis1=self.axis1) + + class Tanh(torch.nn.Module): def __init__(self): super().__init__() @@ -1898,6 +2055,19 @@ def forward(self, x): return torch.tanh(x) +class Threshold(torch.nn.Module): + def __init__(self, threshold=0.0, value=0.0, inplace=False): + super().__init__() + self.threshold = threshold + self.value = value + self.inplace = inplace + + def forward(self, x): + return torch.nn.functional.threshold( + x, threshold=self.threshold, value=self.value, inplace=self.inplace + ) + + class TopKandIndex(torch.nn.Module): def __init__(self): super().__init__() @@ -1916,6 +2086,16 @@ def forward(self, x): return torch.unbind(x) +class Unflatten(torch.nn.Module): + def __init__(self, dim, sizes): + super().__init__() + self.dim = dim + self.sizes = sizes + + def forward(self, x): + return torch.unflatten(x, dim=self.dim, sizes=self.sizes) + + class Unfold(torch.nn.Module): def __init__(self): super().__init__() diff --git a/backends/qualcomm/tests/test_passes.py b/backends/qualcomm/tests/test_passes.py new file mode 100644 index 00000000000..94a5d08acc1 --- /dev/null +++ b/backends/qualcomm/tests/test_passes.py @@ -0,0 +1,54 @@ +import unittest + +import torch +from executorch.backends.qualcomm._passes import InsertReshapeForReduceOps + + +class TestPasses(unittest.TestCase): + def test_insert_reshape_for_argmax(self): + class ArgmaxModule(torch.nn.Module): + def forward(self, x): + return torch.argmax(x, dim=None) + + mod = ArgmaxModule() + + x = torch.tensor([[1.0, 5.0], [3.0, 2.0]]) + ep = torch.export.export(mod, (x,)) + # Run original 
module for reference + ref = mod(x) + + reshape_nodes = [ + n for n in ep.graph.nodes if n.target == torch.ops.aten.reshape.default + ] + argmax_nodes = [ + n for n in ep.graph.nodes if n.target == torch.ops.aten.argmax.default + ] + self.assertTrue(len(reshape_nodes) == 0, "Reshape node not inserted") + self.assertTrue(len(argmax_nodes) == 1, "Argmax node missing") + + InsertReshapeForReduceOps()(ep.graph_module) + + out = ep.graph_module(x) + + # Check graph structure: argmax should take a reshape as input + reshape_nodes = [ + n for n in ep.graph.nodes if n.target == torch.ops.aten.reshape.default + ] + argmax_nodes = [ + n for n in ep.graph.nodes if n.target == torch.ops.aten.argmax.default + ] + self.assertTrue(len(reshape_nodes) == 1, "Reshape node should be inserted") + self.assertTrue(len(argmax_nodes) == 1, "Argmax node missing") + + argmax_node = argmax_nodes[0] + self.assertEqual(argmax_node.args[1], 0, "Argmax dim not set to 0") + + # Execute new graph and compare with reference + out = ep.graph_module(x) + self.assertTrue( + torch.equal(*out, ref), f"Output mismatch: got {out}, expected {ref}" + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 5a86d5f286d..2641acc5a2d 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
import io +import itertools import json import subprocess import sys @@ -173,14 +174,64 @@ def test_qnn_backend_arange(self): self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_argmax(self): - module = Argmax() # noqa: F405 - sample_input = (torch.randn(16, 3, 4, 4),) - self.lower_module_and_test_output(module, sample_input) + test_cases = [ + { + QCOM_MODULE: Argmax(), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(16, 3, 4, 4),), + }, + { + QCOM_MODULE: Argmax(dim=0, keepdim=True), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(16, 3, 4, 4),), + }, + { + QCOM_MODULE: Argmax(dim=1, keepdim=False), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(8, 5),), + }, + { + QCOM_MODULE: Argmax(dim=None, keepdim=False), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.tensor([5.0]),), + }, + { + QCOM_MODULE: Argmax(dim=2, keepdim=True), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(2, 3, 4),), + }, + ] + + for i, case in enumerate(test_cases): + with self.subTest(i=i): + self.lower_module_and_test_output( + case[QCOM_MODULE], case[QCOM_SAMPLE_INPUTS] + ) def test_qnn_backend_argmin(self): - module = Argmin() # noqa: F405 - sample_input = (torch.rand(3, 4),) - self.lower_module_and_test_output(module, sample_input) + test_cases = [ + { + QCOM_MODULE: Argmin(), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(16, 3, 4, 4),), + }, + { + QCOM_MODULE: Argmin(dim=0, keepdim=True), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(16, 3, 4, 4),), + }, + { + QCOM_MODULE: Argmin(dim=1, keepdim=False), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(8, 5),), + }, + { + QCOM_MODULE: Argmin(dim=None, keepdim=False), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.tensor([5.0]),), + }, + { + QCOM_MODULE: Argmin(dim=2, keepdim=True), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(2, 3, 4),), + }, + ] + + for i, case in enumerate(test_cases): + with self.subTest(i=i): + self.lower_module_and_test_output( + case[QCOM_MODULE], case[QCOM_SAMPLE_INPUTS] + ) 
@unittest.expectedFailure def test_qnn_backend_asin(self): @@ -232,7 +283,7 @@ def test_qnn_backend_cast(self): self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_cat(self): - modules = [Cat2(), Cat3(), Cat4()] # noqa: F405 + modules = [Cat2(), Cat3(), Cat4(), Cat5()] # noqa: F405 sample_input = (torch.randn(1, 1, 2, 2), torch.randn(1, 1, 4, 2)) for i, module in enumerate(modules): with self.subTest(i=i): @@ -282,6 +333,13 @@ def test_qnn_backend_conv2d_channel_last(self): with self.subTest(i=i): self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_conv3d_sequential(self): + modules = [Conv3dSequential(), Conv3dSequential(bias=False)] # noqa: F405 + sample_input = (torch.randn([2, 1, 10, 32, 32]),) + for i, module in enumerate(modules): + with self.subTest(i=i): + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_conv_transpose1d(self): modules = [ ConvTranspose1dSingle(), # noqa: F405 @@ -306,6 +364,18 @@ def test_qnn_backend_conv_transpose2d(self): with self.subTest(i=i): self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_conv_transpose3d(self): + modules = [ + ConvTranspose3dSingle(), # noqa: F405 + ConvTranspose3dSingle(bias=False), # noqa: F405 + ConvTranspose3dSingle(dilation=2), # noqa: F405 + ConvTranspose3dSingle(dilation=(3, 2, 3)), # noqa: F405 + ] + sample_input = (torch.randn([1, 1, 3, 3, 3]),) + for i, module in enumerate(modules): + with self.subTest(i=i): + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_cos(self): module = Cos() # noqa: F405 sample_input = (torch.randn(2, 5, 1, 3),) @@ -328,8 +398,8 @@ def test_qnn_backend_cumsum(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): - self.lower_module_and_test_output(module, sample_input) index += 1 + self.lower_module_and_test_output(module, sample_input) def 
test_qnn_backend_einsum_outer_product(self): module = EinsumOuterProduct() # noqa: F405 @@ -372,6 +442,24 @@ def test_qnn_backend_element_wise_add(self): ], QCOM_SAMPLE_INPUTS: [(torch.randint(0, 10, size=(2, 3)),)], }, + { + QCOM_MODULE: [ + AddAlpha(alpha=2), # noqa: F405 + ], + QCOM_SAMPLE_INPUTS: [ + ( + torch.tensor([[1.2, 1.3, 1.4]]), + torch.tensor([[0.8, 1.6, 0.2]]), + ) + ], + }, + { + QCOM_MODULE: [ + AddAlphaConstant(alpha=2, constant_first=True), # noqa: F405 + AddAlphaConstant(alpha=2, constant_first=False), # noqa: F405 + ], + QCOM_SAMPLE_INPUTS: [(torch.tensor([[1.2, 1.3, 1.4]]),)], + }, ] index = 0 @@ -379,8 +467,8 @@ def test_qnn_backend_element_wise_add(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): - self.lower_module_and_test_output(module, sample_input) index += 1 + self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_element_wise_and(self): module = And(torch.tensor(1.7), torch.tensor(0.2)) # noqa: F405 @@ -418,8 +506,8 @@ def test_qnn_backend_element_wise_div(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): - self.lower_module_and_test_output(module, sample_input) index += 1 + self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_element_wise_mul(self): test_comb = [ @@ -445,8 +533,8 @@ def test_qnn_backend_element_wise_mul(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): - self.lower_module_and_test_output(module, sample_input) index += 1 + self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_element_wise_or(self): test_comb = [ @@ -495,6 +583,24 @@ def test_qnn_backend_element_wise_sub(self): QCOM_MODULE: [SubConstantFloat()], # noqa: F405 QCOM_SAMPLE_INPUTS: [(torch.randn(2, 5, 1, 3),)], }, + { + QCOM_MODULE: [ + SubAlpha(alpha=2), # noqa: F405 + ], + QCOM_SAMPLE_INPUTS: [ + ( 
+ torch.tensor([[1.2, 1.3, 1.4]]), + torch.tensor([[0.8, 1.6, 0.2]]), + ) + ], + }, + { + QCOM_MODULE: [ + SubAlphaConstant(alpha=2, constant_first=True), # noqa: F405 + SubAlphaConstant(alpha=2, constant_first=False), # noqa: F405 + ], + QCOM_SAMPLE_INPUTS: [(torch.tensor([[1.2, 1.3, 1.4]]),)], + }, ] index = 0 @@ -502,8 +608,8 @@ def test_qnn_backend_element_wise_sub(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): - self.lower_module_and_test_output(module, sample_input) index += 1 + self.lower_module_and_test_output(module, sample_input) @unittest.expectedFailure def test_qnn_backend_elu(self): @@ -545,10 +651,10 @@ def test_qnn_backend_expand(self): for module in modules: for sample_input in sample_inputs: with self.subTest(i=index): + index += 1 self.lower_module_and_test_output( module, sample_input, passes_job=passes_job ) - index += 1 def test_qnn_backend_expm1(self): sample_input = (torch.randn(3, 4, 5),) @@ -571,6 +677,21 @@ def test_qnn_backend_floor_divide(self): { QCOM_MODULE: [FloorDiv()], # noqa: F405 QCOM_SAMPLE_INPUTS: [ + (torch.randint(-100, 100, (10, 10)), torch.full((10, 10), 3)), + ( + torch.randint(-100, 100, (10, 10)).float(), + torch.full((10, 10), 2.5), + ), + (torch.randint(-1000, 1000, (10, 10)), torch.full((10, 10), 100)), + (torch.tensor([10]), torch.arange(1, 5)), # Failed + (torch.arange(-10, 10), torch.tensor([2])), + (torch.randint(-100, 100, (20,)), torch.full((20,), 2)), + (torch.randint(-100, 100, (5, 10)), torch.full((5, 10), 2)), + (torch.randint(-100, 100, (3, 4, 5)), torch.full((3, 4, 5), 2)), + ( + torch.randint(-100, 100, (2, 3, 4, 5)), + torch.full((2, 3, 4, 5), 2), + ), (torch.randn(2, 5, 1, 3), eps + torch.randn(2, 5, 1, 3)), (torch.randn([2, 5, 1, 3]), eps + torch.randn([4, 1])), ], @@ -586,8 +707,8 @@ def test_qnn_backend_floor_divide(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): - 
self.lower_module_and_test_output(module, sample_input) index += 1 + self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_fold(self): sample_input = (torch.randn(3, 512, 256),) @@ -631,6 +752,13 @@ def test_qnn_backend_gelu(self): sample_input = (torch.randn(2, 5, 1, 3),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_glu(self): + modules = [torch.nn.GLU(), torch.nn.GLU(dim=0)] + sample_input = (torch.randn(2, 5, 1, 4),) + for i, module in enumerate(modules): + with self.subTest(i=i): + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_greater_equal(self): test_comb = [ { @@ -760,28 +888,191 @@ def test_qnn_backend_index_copy(self): ) def test_qnn_backend_index_put(self): - test_comb = [ - { - QCOM_MODULE: IndexPut(skip_mutable_buffer=False), # noqa: F405 - QCOM_SAMPLE_INPUTS: ( - torch.tensor([2], dtype=torch.int32), - torch.randn([1, 1, 12, 64]), + skip_mutable_buffer = [False, True] + total_test_combo = [] + # mode 0 + sample_inputs = [ + (torch.tensor([0], dtype=torch.int32), torch.randn([1, 1, 12, 64])), + (torch.tensor([0], dtype=torch.int32), torch.randn([1, 64])), + (torch.tensor([0, 1], dtype=torch.int32), torch.randn([2, 1, 12, 64])), + (torch.tensor([0, 1], dtype=torch.int32), torch.randn([1, 64])), + ] + total_test_combo.append( + list(itertools.product(skip_mutable_buffer, sample_inputs)) + ) + # mode 1 + sample_inputs = [ + (torch.tensor([2], dtype=torch.int32), torch.randn([1, 1, 12, 64])), + (torch.tensor([2], dtype=torch.int32), torch.randn([1, 64])), + (torch.tensor([2, 3], dtype=torch.int32), torch.randn([1, 2, 12, 64])), + (torch.tensor([2, 3], dtype=torch.int32), torch.randn([1, 64])), + ] + total_test_combo.append( + list(itertools.product(skip_mutable_buffer, sample_inputs)) + ) + # mode 2 + sample_inputs = [ + (torch.tensor([2], dtype=torch.int32), torch.randn([1, 1, 1, 64])), + (torch.tensor([2], dtype=torch.int32), torch.randn([1, 64])), + 
(torch.tensor([0, 1], dtype=torch.int32), torch.randn([1, 1, 2, 64])), + (torch.tensor([2, 3], dtype=torch.int32), torch.randn([1, 64])), + ] + total_test_combo.append( + list(itertools.product(skip_mutable_buffer, sample_inputs)) + ) + # mode 3 + sample_inputs = [ + ( + ( + torch.tensor([0, 1], dtype=torch.int32), + torch.tensor([2, 3], dtype=torch.int32), ), - }, - { - QCOM_MODULE: IndexPut(skip_mutable_buffer=True), # noqa: F405 - QCOM_SAMPLE_INPUTS: ( - torch.tensor([2], dtype=torch.int32), - torch.randn([1, 1, 12, 64]), + torch.randn([2, 12, 64]), + ), + ( + ( + torch.tensor([0, 1], dtype=torch.int32), + torch.tensor([2, 3], dtype=torch.int32), ), - }, + torch.randn([1, 64]), + ), ] - for i, test in enumerate(test_comb): + total_test_combo.append( + list(itertools.product(skip_mutable_buffer, sample_inputs)) + ) + # mode 4 + sample_inputs = [ + ( + ( + torch.tensor([0, 1], dtype=torch.int32), + torch.tensor([2, 3], dtype=torch.int32), + ), + torch.randn([2, 64]), + ), + ( + ( + torch.tensor([0, 1], dtype=torch.int32), + torch.tensor([2, 3], dtype=torch.int32), + ), + torch.randn([1, 64]), + ), + ] + total_test_combo.append( + list(itertools.product(skip_mutable_buffer, sample_inputs)) + ) + # mode 5 + sample_inputs = [ + ( + ( + torch.tensor([0, 1], dtype=torch.int32), + torch.tensor([2, 3], dtype=torch.int32), + ), + torch.randn([64]), + ), + ( + ( + torch.tensor([0, 1], dtype=torch.int32), + torch.tensor([2, 3], dtype=torch.int32), + ), + torch.randn([1]), + ), + ] + total_test_combo.append( + list(itertools.product(skip_mutable_buffer, sample_inputs)) + ) + + for i, test_combo in enumerate(total_test_combo): + for j, combo in enumerate(test_combo): + with self.subTest(f"mode_{i}-{j}"): + self.lower_module_and_test_output( + IndexPut(skip_mutable_buffer=combo[0], mode=i), # noqa: F405 + combo[1], + skip_mutable_buffer=combo[0], + ) + + def test_qnn_backend_index_put_suite(self): + accumulate = [False, True] + in_place = [False, True] + sample_inputs = [ + # 
basic + ( + torch.rand(5, 2) * 100, + (torch.tensor([0, 2]),), + torch.tensor([10.0, 20.0]), + ), + (torch.rand(5, 2), (torch.tensor([0, 2]),), torch.tensor([10.0, 20.0])), + # shape + (torch.rand(5), (torch.tensor([0, 2]),), torch.tensor([10.0, 20.0])), + ( + torch.rand(5, 2), + (torch.tensor([0, 2]), torch.tensor([1, 1])), + torch.tensor([10.0, 20.0]), + ), + ( + torch.rand(5, 3, 2), + (torch.tensor([0, 2]), torch.tensor([1, 1]), torch.tensor([0, 1])), + torch.tensor([10.0, 20.0]), + ), + # TODO: not supported by HTP + # ( + # torch.rand(5, 3, 2, 4), + # (torch.tensor([0, 2]), torch.tensor([1, 1]), torch.tensor([0, 1]), torch.tensor([2, 3])), + # torch.tensor([10.0]), + # ), + # indices + (torch.rand(5, 2), (torch.tensor([2]),), torch.tensor([10.0])), + ( + torch.rand(5, 3), + (torch.tensor([0, 2, 4]),), + torch.tensor([10.0, 20.0, 30.0]), + ), + ( + torch.rand(5), + (torch.tensor([1, 1, 3, 3]),), + torch.tensor([10.0, 20.0, 30.0, 40.0]), + ), + # broadcasting + (torch.rand(5, 3), (torch.tensor([0, 2, 4]),), torch.tensor([42.0])), + ( + torch.rand(3, 4), + (torch.tensor([0, 1]), torch.tensor([1, 2])), + torch.tensor([10.0, 20.0]), + ), + (torch.rand(4, 2), (torch.tensor([0, 2]),), torch.tensor([5.0, 15.0])), + ( + torch.rand(3, 2, 2), + (torch.tensor([0, 1]),), + torch.tensor([[1.0, 2.0], [3.0, 4.0]]), + ), + (torch.rand(4, 2), (torch.tensor([1, 1, 1]),), torch.tensor([5.0])), + # two-index + ( + torch.rand(4, 3), + (torch.tensor([0, 1, 2]), torch.tensor([1, 0, 2])), + torch.tensor([10.0, 20.0, 30.0]), + ), + ( + torch.rand(3, 3), + (torch.tensor([0, 2]), torch.tensor([1, 1])), + torch.tensor([15.0, 25.0]), + ), + ( + torch.rand(3, 2), + (torch.tensor([1, 1, 2]), torch.tensor([0, 0, 1])), + torch.tensor([5.0, 10.0, 15.0]), + ), + ( + torch.rand(3, 2), + (torch.tensor([1]), torch.tensor([0, 0, 1])), + torch.tensor([5.0, 10.0, 15.0]), + ), + ] + test_combo = list(itertools.product(accumulate, in_place, sample_inputs)) + for i, combo in enumerate(test_combo): with 
self.subTest(i=i): self.lower_module_and_test_output( - test[QCOM_MODULE], - test[QCOM_SAMPLE_INPUTS], - skip_mutable_buffer=test[QCOM_MODULE].skip_mutable_buffer, + IndexPutSuite(accumulate=combo[0], in_place=combo[1]), # noqa: F405 + combo[2], ) def test_qnn_backend_index_select(self): @@ -860,8 +1151,8 @@ def test_qnn_backend_leaky_relu(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): - self.lower_module_and_test_output(module, sample_input) index += 1 + self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_less_equal(self): test_comb = [ @@ -956,12 +1247,61 @@ def test_qnn_backend_max_pool2d(self): sample_input = (torch.randn(4, 3, 24, 24),) self.lower_module_and_test_output(module, sample_input) - def test_qnn_backend_mean_dim(self): - modules = [MeanWKeppDim(), MeanWOKeppDim()] # noqa: F405 - sample_input = (torch.randn([2, 5, 1, 3]),) - for i, module in enumerate(modules): + def test_qnn_backend_mean(self): + test_comb = [ + # Reduce over last two dims, keepdim=True + { + QCOM_MODULE: Mean(dim=(-1, -2), keepdim=True), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn([2, 5, 1, 3]),), + }, + # Reduce over last two dims, keepdim=False + { + QCOM_MODULE: Mean(dim=(-1, -2), keepdim=False), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn([2, 5, 1, 3]),), + }, + # Default: reduce all dims + { + QCOM_MODULE: Mean(), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(10, 10),), + }, + # TODO: To be enabled via reshape input to 1d tensor + # # Scalar case + # { + # QCOM_MODULE: Mean(), + # QCOM_SAMPLE_INPUTS: (torch.tensor(5.0),), + # }, + # Edge case: dim is a empty list + { + QCOM_MODULE: Mean(dim=[]), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),), + }, + # Edge case: reduce along dim=0 (batch dimension) + { + QCOM_MODULE: Mean(dim=0), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),), + }, + # Edge case: reduce along dim=0 with keepdim=True + { + QCOM_MODULE: 
Mean(dim=0, keepdim=True), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),), + }, + # Edge case: reduce along multiple dims + { + QCOM_MODULE: Mean(dim=(0, 2)), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(3, 4, 5),), + }, + # Edge case: high-dimensional tensor + { + QCOM_MODULE: Mean(dim=(1, 3), keepdim=True), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(2, 3, 4, 5, 6),), + }, + ] + + for i, test in enumerate(test_comb): with self.subTest(i=i): - self.lower_module_and_test_output(module, sample_input) + self.lower_module_and_test_output( + test[QCOM_MODULE], test[QCOM_SAMPLE_INPUTS] + ) @unittest.skip("failed to lower in QNN 2.26") def test_qnn_backend_mha(self): @@ -1006,6 +1346,16 @@ def test_qnn_backend_pad(self): sample_input = (torch.randn([1, 8, 128]),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_permute(self): + modules = [ + Permute([0, 2, 3, 1]), # noqa: F405 + Permute([-1, -3, -2, -4]), # noqa: F405 + ] + sample_input = (torch.randn([2, 3, 4, 5]),) + for i, module in enumerate(modules): + with self.subTest(i=i): + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_pixel_shuffle(self): module = PixelShuffle(2) # noqa: F405 sample_input = (torch.ones([2, 4, 3, 3]),) @@ -1017,9 +1367,28 @@ def test_qnn_backend_pixel_unshuffle(self): self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_pow_tensor_scalar(self): - module = PowTensorScalar() # noqa: F405 - sample_input = (torch.rand([2, 4, 3, 3]),) - self.lower_module_and_test_output(module, sample_input) + test_comb = [ + { + QCOM_MODULE: [ + PowTensorScalar(), # noqa: F405 + PowTensorScalar(1), # noqa: F405 + PowTensorScalar(-1), # noqa: F405 + PowTensorScalar(0.5), # noqa: F405 + ], # noqa: F405 + QCOM_SAMPLE_INPUTS: [(torch.rand(10, 10) + 0.1,)], + }, + { + QCOM_MODULE: [PowTensorScalar(10)], # noqa: F405 + QCOM_SAMPLE_INPUTS: [(torch.rand(10, 10) * 0.5 + 0.5,)], + }, + ] + index = 0 + for comb in 
test_comb: + for module in comb[QCOM_MODULE]: + for sample_input in comb[QCOM_SAMPLE_INPUTS]: + with self.subTest(i=index): + index += 1 + self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_prelu(self): test_comb = [ @@ -1038,8 +1407,8 @@ def test_qnn_backend_prelu(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): - self.lower_module_and_test_output(module, sample_input) index += 1 + self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_relu(self): module = Relu() # noqa: F405 @@ -1154,10 +1523,8 @@ def test_qnn_backend_slice_scatter(self): ], QCOM_SAMPLE_INPUTS: [ ( - ( - torch.zeros(8, 8), - torch.ones(8, 2), - ) + torch.zeros(8, 8), + torch.ones(8, 2), ) ], }, @@ -1168,8 +1535,8 @@ def test_qnn_backend_slice_scatter(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): - self.lower_module_and_test_output(module, sample_input) index += 1 + self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_stack(self): module = Stack() # noqa: F405 @@ -1202,11 +1569,32 @@ def test_qnn_backend_sum_int_list(self): sample_input = (torch.randn([1, 4, 8, 8]),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_swapaxes(self): + module = SwapAxes(0, 1) # noqa: F405 + sample_input = (torch.randn([1, 2, 3, 4]),) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_tanh(self): module = Tanh() # noqa: F405 sample_input = (torch.randn(2, 5, 1, 3),) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_threshold(self): + modules = [ + Threshold(), # noqa: F405 + Threshold(threshold=0.5, value=3.0, inplace=True), # noqa: F405 + Threshold(threshold=0.5, value=3.0, inplace=False), # noqa: F405 + ] + sample_input = (torch.randn(2, 5, 1, 3),) + for i, module in enumerate(modules): + with self.subTest(i=i): + 
self.lower_module_and_test_output(module, sample_input) + + def test_qnn_backend_unflatten(self): + module = Unflatten(dim=1, sizes=(2, 3, 4)) # noqa: F405 + sample_input = (torch.randn([1, 24]),) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_unbind(self): module = Unbind() # noqa: F405 sample_input = (torch.randn([3, 3]),) @@ -1638,16 +2026,66 @@ def test_qnn_backend_arange(self): self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_argmax(self): - module = Argmax() # noqa: F405 - sample_input = (torch.randn(16, 3, 4, 4),) - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) + test_cases = [ + { + QCOM_MODULE: Argmax(), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(16, 3, 4, 4),), + }, + { + QCOM_MODULE: Argmax(dim=0, keepdim=True), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(16, 3, 4, 4),), + }, + { + QCOM_MODULE: Argmax(dim=1, keepdim=False), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(8, 5),), + }, + { + QCOM_MODULE: Argmax(dim=None, keepdim=False), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.tensor([5.0]),), + }, + { + QCOM_MODULE: Argmax(dim=2, keepdim=True), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(2, 3, 4),), + }, + ] + + for i, case in enumerate(test_cases): + with self.subTest(i=i): + module = self.get_qdq_module( + case[QCOM_MODULE], case[QCOM_SAMPLE_INPUTS] + ) + self.lower_module_and_test_output(module, case[QCOM_SAMPLE_INPUTS]) def test_qnn_backend_argmin(self): - module = Argmin() # noqa: F405 - sample_input = (torch.randn(16, 3, 4, 4),) - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) + test_cases = [ + { + QCOM_MODULE: Argmin(), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(16, 3, 4, 4),), + }, + { + QCOM_MODULE: Argmin(dim=0, keepdim=True), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(16, 3, 4, 4),), + }, + { + QCOM_MODULE: Argmin(dim=1, 
keepdim=False), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(8, 5),), + }, + { + QCOM_MODULE: Argmin(dim=None, keepdim=False), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.tensor([5.0]),), + }, + { + QCOM_MODULE: Argmin(dim=2, keepdim=True), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(2, 3, 4),), + }, + ] + + for i, case in enumerate(test_cases): + with self.subTest(i=i): + module = self.get_qdq_module( + case[QCOM_MODULE], case[QCOM_SAMPLE_INPUTS] + ) + self.lower_module_and_test_output(module, case[QCOM_SAMPLE_INPUTS]) def test_qnn_backend_asin(self): module = Asin() # noqa: F405 @@ -1699,7 +2137,7 @@ def test_qnn_backend_cast(self): self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_cat(self): - modules = [Cat2(), Cat3(), Cat4()] # noqa: F405 + modules = [Cat2(), Cat3(), Cat4(), Cat5()] # noqa: F405 sample_input = (torch.randn(1, 1, 2, 2), torch.randn(1, 1, 4, 2)) for i, module in enumerate(modules): with self.subTest(i=i): @@ -1789,6 +2227,14 @@ def test_qnn_backend_conv2d_channel_last(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_conv3d_sequential(self): + modules = [Conv3dSequential(), Conv3dSequential(bias=False)] # noqa: F405 + sample_input = (torch.randn([2, 1, 10, 32, 32]),) + for i, module in enumerate(modules): + with self.subTest(i=i): + qdq_module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(qdq_module, sample_input) + def test_qnn_backend_conv_transpose1d(self): modules = [ ConvTranspose1dSingle(), # noqa: F405 @@ -1814,6 +2260,19 @@ def test_qnn_backend_conv_transpose2d(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_conv_transpose3d(self): + modules = [ + ConvTranspose3dSingle(), # noqa: F405 + ConvTranspose3dSingle(bias=False), # noqa: F405 + ConvTranspose3dSingle(dilation=2), # noqa: F405 + 
ConvTranspose3dSingle(dilation=(3, 2, 3)), # noqa: F405 + ] + sample_input = (torch.randn([1, 1, 3, 3, 3]),) + for i, module in enumerate(modules): + with self.subTest(i=i): + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_cos(self): module = Cos() # noqa: F405 sample_input = (torch.randn(2, 5, 1, 3),) @@ -1863,6 +2322,24 @@ def test_qnn_backend_element_wise_add(self): QCOM_MODULE: [AddConstantFloat(), AddConstantLong()], # noqa: F405 QCOM_SAMPLE_INPUTS: [(torch.randn(2, 5, 1, 3),)], }, + { + QCOM_MODULE: [ + AddAlpha(alpha=2), # noqa: F405 + ], + QCOM_SAMPLE_INPUTS: [ + ( + torch.tensor([[1.2, 1.3, 1.4]]), + torch.tensor([[0.8, 1.6, 0.2]]), + ) + ], + }, + { + QCOM_MODULE: [ + AddAlphaConstant(alpha=2, constant_first=True), # noqa: F405 + AddAlphaConstant(alpha=2, constant_first=False), # noqa: F405 + ], + QCOM_SAMPLE_INPUTS: [(torch.tensor([[1.2, 1.3, 1.4]]),)], + }, ] index = 0 @@ -1870,9 +2347,9 @@ def test_qnn_backend_element_wise_add(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): + index += 1 gm = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(gm, sample_input) - index += 1 def test_qnn_backend_element_wise_and(self): module = And(torch.tensor(1.7), torch.tensor(0.2)) # noqa: F405 @@ -1911,9 +2388,9 @@ def test_qnn_backend_element_wise_div(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): + index += 1 gm = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(gm, sample_input) - index += 1 def test_qnn_backend_element_wise_mul(self): test_comb = [ @@ -1939,9 +2416,9 @@ def test_qnn_backend_element_wise_mul(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): + index += 1 gm = self.get_qdq_module(module, sample_input) 
self.lower_module_and_test_output(gm, sample_input) - index += 1 def test_qnn_backend_element_wise_or(self): test_comb = [ @@ -1992,6 +2469,24 @@ def test_qnn_backend_element_wise_sub(self): QCOM_MODULE: [SubConstantFloat(), SubConstantLong()], # noqa: F405 QCOM_SAMPLE_INPUTS: [(torch.randn(2, 5, 1, 3),)], }, + { + QCOM_MODULE: [ + SubAlpha(alpha=2), # noqa: F405 + ], + QCOM_SAMPLE_INPUTS: [ + ( + torch.tensor([[1.2, 1.3, 1.4]]), + torch.tensor([[0.8, 1.6, 0.2]]), + ) + ], + }, + { + QCOM_MODULE: [ + SubAlphaConstant(alpha=2, constant_first=True), # noqa: F405 + SubAlphaConstant(alpha=2, constant_first=False), # noqa: F405 + ], + QCOM_SAMPLE_INPUTS: [(torch.tensor([[1.2, 1.3, 1.4]]),)], + }, ] index = 0 @@ -1999,9 +2494,9 @@ def test_qnn_backend_element_wise_sub(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): + index += 1 gm = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(gm, sample_input) - index += 1 def test_qnn_backend_elu(self): module = Elu() # noqa: F405 @@ -2050,11 +2545,11 @@ def test_qnn_backend_expand(self): for module in modules: for sample_input in sample_inputs: with self.subTest(i=index): + index += 1 module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output( module, sample_input, passes_job=passes_job ) - index += 1 def test_qnn_backend_expm1(self): sample_input = (torch.randn(3, 4, 5),) @@ -2080,6 +2575,21 @@ def test_qnn_backend_floor_divide(self): { QCOM_MODULE: [FloorDiv()], # noqa: F405 QCOM_SAMPLE_INPUTS: [ + (torch.randint(-100, 100, (10, 10)), torch.full((10, 10), 3)), + ( + torch.randint(-100, 100, (10, 10)).float(), + torch.full((10, 10), 2.5), + ), + (torch.randint(-1000, 1000, (10, 10)), torch.full((10, 10), 100)), + (torch.tensor([10]), torch.arange(1, 5)), + (torch.arange(-10, 10), torch.tensor([2])), + (torch.randint(-100, 100, (20,)), torch.full((20,), 2)), + (torch.randint(-100, 100, (5, 10)), torch.full((5, 
10), 2)), + (torch.randint(-100, 100, (3, 4, 5)), torch.full((3, 4, 5), 2)), + ( + torch.randint(-100, 100, (2, 3, 4, 5)), + torch.full((2, 3, 4, 5), 2), + ), (torch.randn(2, 5, 1, 3), eps + torch.randn(2, 5, 1, 3)), (torch.randn([2, 5, 1, 3]), eps + torch.randn([4, 1])), ], @@ -2095,9 +2605,12 @@ def test_qnn_backend_floor_divide(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): - gm = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(gm, sample_input) index += 1 + # Support int input cases with bypass_check=True + gm = self.get_qdq_module( + module, sample_input, bypass_check=True + ) + self.lower_module_and_test_output(gm, sample_input) def test_qnn_backend_fold(self): sample_input = (torch.randn(3, 512, 256),) @@ -2146,6 +2659,14 @@ def test_qnn_backend_gelu(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_glu(self): + modules = [torch.nn.GLU(), torch.nn.GLU(dim=0)] + sample_input = (torch.randn(2, 5, 1, 4),) + for i, module in enumerate(modules): + with self.subTest(i=i): + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_greater_equal(self): test_comb = [ { @@ -2285,32 +2806,197 @@ def test_qnn_backend_index_copy(self): ) def test_qnn_backend_index_put(self): - test_comb = [ - { - QCOM_MODULE: IndexPut(skip_mutable_buffer=False), # noqa: F405 - QCOM_SAMPLE_INPUTS: ( - torch.tensor([2], dtype=torch.int32), - torch.randn([1, 1, 12, 64]), + skip_mutable_buffer = [False, True] + total_test_combo = [] + # mode 0 + sample_inputs = [ + (torch.tensor([0], dtype=torch.int32), torch.randn([1, 1, 12, 64])), + (torch.tensor([0], dtype=torch.int32), torch.randn([1, 64])), + (torch.tensor([0, 1], dtype=torch.int32), torch.randn([2, 1, 12, 64])), + (torch.tensor([0, 1], dtype=torch.int32), torch.randn([1, 64])), + ] 
+ total_test_combo.append( + list(itertools.product(skip_mutable_buffer, sample_inputs)) + ) + # mode 1 + sample_inputs = [ + (torch.tensor([2], dtype=torch.int32), torch.randn([1, 1, 12, 64])), + (torch.tensor([2], dtype=torch.int32), torch.randn([1, 64])), + (torch.tensor([2, 3], dtype=torch.int32), torch.randn([1, 2, 12, 64])), + (torch.tensor([2, 3], dtype=torch.int32), torch.randn([1, 64])), + ] + total_test_combo.append( + list(itertools.product(skip_mutable_buffer, sample_inputs)) + ) + # mode 2 + sample_inputs = [ + (torch.tensor([2], dtype=torch.int32), torch.randn([1, 1, 1, 64])), + (torch.tensor([2], dtype=torch.int32), torch.randn([1, 64])), + (torch.tensor([0, 1], dtype=torch.int32), torch.randn([1, 1, 2, 64])), + (torch.tensor([2, 3], dtype=torch.int32), torch.randn([1, 64])), + ] + total_test_combo.append( + list(itertools.product(skip_mutable_buffer, sample_inputs)) + ) + # mode 3 + sample_inputs = [ + ( + ( + torch.tensor([0, 1], dtype=torch.int32), + torch.tensor([2, 3], dtype=torch.int32), ), - }, - { - QCOM_MODULE: IndexPut(skip_mutable_buffer=True), # noqa: F405 - QCOM_SAMPLE_INPUTS: ( - torch.tensor([2], dtype=torch.int32), - torch.randn([1, 1, 12, 64]), + torch.randn([2, 12, 64]), + ), + ( + ( + torch.tensor([0, 1], dtype=torch.int32), + torch.tensor([2, 3], dtype=torch.int32), ), - }, + torch.randn([1, 64]), + ), ] - for i, test in enumerate(test_comb): + total_test_combo.append( + list(itertools.product(skip_mutable_buffer, sample_inputs)) + ) + # mode 4 + sample_inputs = [ + ( + ( + torch.tensor([0, 1], dtype=torch.int32), + torch.tensor([2, 3], dtype=torch.int32), + ), + torch.randn([2, 64]), + ), + ( + ( + torch.tensor([0, 1], dtype=torch.int32), + torch.tensor([2, 3], dtype=torch.int32), + ), + torch.randn([1, 64]), + ), + ] + total_test_combo.append( + list(itertools.product(skip_mutable_buffer, sample_inputs)) + ) + # mode 5 + sample_inputs = [ + ( + ( + torch.tensor([0, 1], dtype=torch.int32), + torch.tensor([2, 3], 
dtype=torch.int32), + ), + torch.randn([64]), + ), + ( + ( + torch.tensor([0, 1], dtype=torch.int32), + torch.tensor([2, 3], dtype=torch.int32), + ), + torch.randn([1]), + ), + ] + total_test_combo.append( + list(itertools.product(skip_mutable_buffer, sample_inputs)) + ) + + for i, test_combo in enumerate(total_test_combo): + for j, combo in enumerate(test_combo): + with self.subTest(f"mode_{i}-{j}"): + module = self.get_qdq_module( + IndexPut(skip_mutable_buffer=combo[0], mode=i), # noqa: F405 + combo[1], + ) + self.lower_module_and_test_output( + module, + combo[1], + skip_mutable_buffer=combo[0], + ) + + def test_qnn_backend_index_put_suite(self): + accumulate = [False, True] + in_place = [False, True] + sample_inputs = [ + # basic + ( + torch.rand(5, 2) * 100, + (torch.tensor([0, 2]),), + torch.tensor([10.0, 20.0]), + ), + (torch.rand(5, 2), (torch.tensor([0, 2]),), torch.tensor([10.0, 20.0])), + # shape + (torch.rand(5), (torch.tensor([0, 2]),), torch.tensor([10.0, 20.0])), + ( + torch.rand(5, 2), + (torch.tensor([0, 2]), torch.tensor([1, 1])), + torch.tensor([10.0, 20.0]), + ), + ( + torch.rand(5, 3, 2), + (torch.tensor([0, 2]), torch.tensor([1, 1]), torch.tensor([0, 1])), + torch.tensor([10.0, 20.0]), + ), + # TODO: not supported by HTP + # ( + # torch.rand(5, 3, 2, 4), + # (torch.tensor([0, 2]), torch.tensor([1, 1]), torch.tensor([0, 1]), torch.tensor([2, 3])), + # torch.tensor([10.0]), + # ), + # indices + (torch.rand(5, 2), (torch.tensor([2]),), torch.tensor([10.0])), + ( + torch.rand(5, 3), + (torch.tensor([0, 2, 4]),), + torch.tensor([10.0, 20.0, 30.0]), + ), + ( + torch.rand(5), + (torch.tensor([1, 1, 3, 3]),), + torch.tensor([10.0, 20.0, 30.0, 40.0]), + ), + # broadcasting + (torch.rand(5, 3), (torch.tensor([0, 2, 4]),), torch.tensor([42.0])), + ( + torch.rand(3, 4), + (torch.tensor([0, 1]), torch.tensor([1, 2])), + torch.tensor([10.0, 20.0]), + ), + (torch.rand(4, 2), (torch.tensor([0, 2]),), torch.tensor([5.0, 15.0])), + ( + torch.rand(3, 2, 2), + 
(torch.tensor([0, 1]),), + torch.tensor([[1.0, 2.0], [3.0, 4.0]]), + ), + (torch.rand(4, 2), (torch.tensor([1, 1, 1]),), torch.tensor([5.0])), + # two-index + ( + torch.rand(4, 3), + (torch.tensor([0, 1, 2]), torch.tensor([1, 0, 2])), + torch.tensor([10.0, 20.0, 30.0]), + ), + ( + torch.rand(3, 3), + (torch.tensor([0, 2]), torch.tensor([1, 1])), + torch.tensor([15.0, 25.0]), + ), + ( + torch.rand(3, 2), + (torch.tensor([1, 1, 2]), torch.tensor([0, 0, 1])), + torch.tensor([5.0, 10.0, 15.0]), + ), + ( + torch.rand(3, 2), + (torch.tensor([1]), torch.tensor([0, 0, 1])), + torch.tensor([5.0, 10.0, 15.0]), + ), + ] + test_combo = list(itertools.product(accumulate, in_place, sample_inputs)) + for i, combo in enumerate(test_combo): with self.subTest(i=i): module = self.get_qdq_module( - test[QCOM_MODULE], test[QCOM_SAMPLE_INPUTS] - ) - self.lower_module_and_test_output( - module, - test[QCOM_SAMPLE_INPUTS], - skip_mutable_buffer=test[QCOM_MODULE].skip_mutable_buffer, + IndexPutSuite(accumulate=combo[0], in_place=combo[1]), # noqa: F405 + combo[2], ) + self.lower_module_and_test_output(module, combo[2]) def test_qnn_backend_index_select(self): module = IndexSelect(dim=1) # noqa: F405 @@ -2395,9 +3081,9 @@ def test_qnn_backend_leaky_relu(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): + index += 1 module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) - index += 1 def test_qnn_backend_less_equal(self): test_comb = [ @@ -2529,13 +3215,62 @@ def test_qnn_backend_max_pool2d(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) - def test_qnn_backend_mean_dim(self): - modules = [MeanWKeppDim(), MeanWOKeppDim()] # noqa: F405 - sample_input = (torch.randn([2, 5, 1, 3]),) - for i, module in enumerate(modules): + def test_qnn_backend_mean(self): + test_comb = [ + # Reduce over last two dims, keepdim=True + { 
+ QCOM_MODULE: Mean(dim=(-1, -2), keepdim=True), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn([2, 5, 1, 3]),), + }, + # Reduce over last two dims, keepdim=False + { + QCOM_MODULE: Mean(dim=(-1, -2), keepdim=False), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn([2, 5, 1, 3]),), + }, + # Default: reduce all dims + { + QCOM_MODULE: Mean(), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(10, 10),), + }, + # TODO: To be enabled via reshape input to 1d tensor + # Scalar case + # { + # QCOM_MODULE: Mean(), + # QCOM_SAMPLE_INPUTS: (torch.tensor(5.0),), + # }, + # Edge case: dim is a empty list + { + QCOM_MODULE: Mean(dim=[]), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),), + }, + # Edge case: reduce along dim=0 (batch dimension) + { + QCOM_MODULE: Mean(dim=0), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),), + }, + # Edge case: reduce along dim=0 with keepdim=True + { + QCOM_MODULE: Mean(dim=0, keepdim=True), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(4, 6, 8),), + }, + # Edge case: reduce along multiple dims + { + QCOM_MODULE: Mean(dim=(0, 2)), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(3, 4, 5),), + }, + # Edge case: high-dimensional tensor + { + QCOM_MODULE: Mean(dim=(1, 3), keepdim=True), # noqa: F405 + QCOM_SAMPLE_INPUTS: (torch.randn(2, 3, 4, 5, 6),), + }, + ] + + for i, test in enumerate(test_comb): with self.subTest(i=i): - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) + module = self.get_qdq_module( + test[QCOM_MODULE], test[QCOM_SAMPLE_INPUTS] + ) + self.lower_module_and_test_output(module, test[QCOM_SAMPLE_INPUTS]) def test_qnn_backend_mha(self): module = MultiheadAttention() # noqa: F405 @@ -2585,6 +3320,17 @@ def test_qnn_backend_pad(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_permute(self): + modules = [ + Permute([0, 2, 3, 1]), # noqa: F405 + Permute([-1, -3, -2, 
-4]), # noqa: F405 + ] + sample_input = (torch.randn([2, 3, 4, 5]),) + for i, module in enumerate(modules): + with self.subTest(i=i): + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_pixel_shuffle(self): module = PixelShuffle(2) # noqa: F405 sample_input = (torch.ones([2, 4, 3, 3]),) @@ -2598,10 +3344,29 @@ def test_qnn_backend_pixel_unshuffle(self): self.lower_module_and_test_output(module, sample_input) def test_qnn_backend_pow_tensor_scalar(self): - module = PowTensorScalar() # noqa: F405 - sample_input = (torch.rand([2, 4, 3, 3]),) - module = self.get_qdq_module(module, sample_input) - self.lower_module_and_test_output(module, sample_input) + test_comb = [ + { + QCOM_MODULE: [ + PowTensorScalar(), # noqa: F405 + PowTensorScalar(1), # noqa: F405 + PowTensorScalar(-1), # noqa: F405 + PowTensorScalar(0.5), # noqa: F405 + ], # noqa: F405 + QCOM_SAMPLE_INPUTS: [(torch.rand(10, 10) + 0.1,)], + }, + { + QCOM_MODULE: [PowTensorScalar(10)], # noqa: F405 + QCOM_SAMPLE_INPUTS: [(torch.rand(10, 10) * 0.5 + 0.5,)], + }, + ] + index = 0 + for comb in test_comb: + for module in comb[QCOM_MODULE]: + for sample_input in comb[QCOM_SAMPLE_INPUTS]: + with self.subTest(i=index): + index += 1 + qdq_module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(qdq_module, sample_input) def test_qnn_backend_prelu(self): test_comb = [ @@ -2620,9 +3385,9 @@ def test_qnn_backend_prelu(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): + index += 1 module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) - index += 1 def test_qnn_backend_relu(self): module = Relu() # noqa: F405 @@ -2760,10 +3525,8 @@ def test_qnn_backend_slice_scatter(self): ], QCOM_SAMPLE_INPUTS: [ ( - ( - torch.zeros(8, 8), - torch.ones(8, 2), - ) + torch.zeros(8, 8), + torch.ones(8, 2), ) ], }, @@ 
-2774,9 +3537,9 @@ def test_qnn_backend_slice_scatter(self): for module in comb[QCOM_MODULE]: for sample_input in comb[QCOM_SAMPLE_INPUTS]: with self.subTest(i=index): + index += 1 module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) - index += 1 def test_qnn_backend_softmax(self): modules = [Softmax(dim=1), Softmax(dim=-1)] # noqa: F405 @@ -2814,12 +3577,36 @@ def test_qnn_backend_sum_int_list(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_swapaxes(self): + module = SwapAxes(0, 1) # noqa: F405 + sample_input = (torch.randn([1, 2, 3, 4]),) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_tanh(self): module = Tanh() # noqa: F405 sample_input = (torch.randn(2, 5, 1, 3),) module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_threshold(self): + modules = [ + Threshold(), # noqa: F405 + Threshold(threshold=0.5, value=3.0, inplace=True), # noqa: F405 + Threshold(threshold=0.5, value=3.0, inplace=False), # noqa: F405 + ] + sample_input = (torch.randn(2, 5, 1, 3),) + for i, module in enumerate(modules): + with self.subTest(i=i): + qdq_module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(qdq_module, sample_input) + + def test_qnn_backend_unflatten(self): + module = Unflatten(dim=1, sizes=(2, 3, 4)) # noqa: F405 + sample_input = (torch.randn([1, 24]),) + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_unbind(self): module = Unbind() # noqa: F405 sample_input = (torch.randn([3, 3]),) @@ -2943,6 +3730,51 @@ def test_qnn_backend_chunk_add(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def 
test_qnn_backend_conformer(self): + from typing import Tuple + + import torchaudio + + class PatchedConformer(torch.nn.Module): + """ + A lightly modified version of the top-level Conformer module, such that it can be exported. + Instead of taking lengths and computing the padding mask, it takes the padding mask directly. + See https://github.com/pytorch/audio/blob/main/src/torchaudio/models/conformer.py#L215 + """ + + def __init__(self, conformer): + super().__init__() + self.conformer = conformer + + def forward( + self, input: torch.Tensor, encoder_padding_mask: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + x = input.transpose(0, 1) + for layer in self.conformer.conformer_layers: + x = layer(x, encoder_padding_mask) + return x.transpose(0, 1) + + inner_model = torchaudio.models.Conformer( + input_dim=80, + num_heads=4, + ffn_dim=128, + num_layers=4, + depthwise_conv_kernel_size=31, + ) + lengths = torch.randint(1, 400, (10,)) + encoder_padding_mask = torchaudio.models.conformer._lengths_to_padding_mask( + lengths + ) + sample_input = ( + torch.rand(10, int(lengths.max()), 80), + encoder_padding_mask.to(torch.float32), + ) + module = PatchedConformer(inner_model).eval() + module = self.get_qdq_module( + module, sample_input, quant_dtype=QuantDtype.use_16a8w + ) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_conv1d_relu_log_softmax(self): modules = [ Conv1dReluLogSoftmax(dim=1), # noqa: F405 @@ -4680,6 +5512,65 @@ def test_qnn_backend_seq_mse(self): class TestExampleLLMScript(TestQNN): + def test_static_gemma_2b(self): + if not self.required_envs(): + self.skipTest("missing required envs") + + prompt = "My favourite condiment is " + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py", + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--model", + self.model, + "--ip", + self.ip, + "--port", + str(self.port), + "--prompt", + f"{prompt}", + 
"--decoder_model", + "gemma-2b", + "--model_mode", + "kv", + "--max_seq_len", + "1024", + "--eval_perplexity", + "--tasks", + "wikitext", + "--limit", + "1", + ] + if self.compile_only: + cmds.extend(["--compile_only"]) + elif self.device: + cmds.extend(["--device", self.device]) + if self.host: + cmds.extend(["--host", self.host]) + elif self.enable_x86_64: + cmds.extend(["--enable_x86_64"]) + if self.pre_gen_pte: + cmds.extend(["--pre_gen_pte", self.pre_gen_pte]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + inference_speed_ref = {"SM8650": 32, "SM8750": 36} + self.assertLessEqual(msg["wiki_ppl"], 35) + self.assertLessEqual(msg["pte_size"], 2_700_000_000) # 2.7GB + if self.model in inference_speed_ref: + self.assertGreaterEqual( + msg["inference_speed"], inference_speed_ref[self.model] + ) + def test_static_gemma3_1b(self): if not self.required_envs(): self.skipTest("missing required envs") @@ -5438,6 +6329,43 @@ def test_conv_former(self): self.assertGreaterEqual(msg["top_1"], 70) self.assertGreaterEqual(msg["top_5"], 92) + def test_convnext_small(self): + if not self.required_envs([self.image_dataset]): + self.skipTest("missing required envs") + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/convnext_small.py", + "--dataset", + self.image_dataset, + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--ip", + self.ip, + "--port", + str(self.port), + "--seed", + str(1126), + ] + if self.host: + cmds.extend(["--host", self.host]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) 
+ else: + self.assertGreaterEqual(msg["top_1"], 76) + self.assertGreaterEqual(msg["top_5"], 97) + def test_cvt(self): if not self.required_envs([self.image_dataset]): self.skipTest("missing required envs") @@ -5936,6 +6864,43 @@ def test_gMLP(self): self.assertGreaterEqual(msg["top_1"], 70) self.assertGreaterEqual(msg["top_5"], 88) + def test_maxvit_t(self): + if not self.required_envs([self.image_dataset]): + self.skipTest("missing required envs") + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/maxvit_t.py", + "--dataset", + self.image_dataset, + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--ip", + self.ip, + "--port", + str(self.port), + "--seed", + str(1126), + ] + if self.host: + cmds.extend(["--host", self.host]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 72) + self.assertGreaterEqual(msg["top_5"], 91) + @unittest.skip("Only outputs good accuracy in QNN 2.29") def test_mobilevit_v2(self): if not self.required_envs([self.image_dataset]): @@ -6282,6 +7247,43 @@ def test_swin_transformer(self): self.assertGreaterEqual(msg["top_1"], 71) self.assertGreaterEqual(msg["top_5"], 90) + def test_swin_v2_t(self): + if not self.required_envs([self.image_dataset]): + self.skipTest("missing required envs") + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/swin_v2_t.py", + "--dataset", + self.image_dataset, + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--ip", + self.ip, + "--port", + str(self.port), + "--seed", + str(1126), + ] + if self.host: + cmds.extend(["--host", self.host]) + + p = 
subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 63) + self.assertGreaterEqual(msg["top_5"], 92) + def test_t5(self): if not self.required_envs([self.qa_dataset]): self.skipTest("missing required envs") @@ -6318,6 +7320,43 @@ def test_t5(self): else: self.assertGreaterEqual(msg["f1"], 0.72) + def test_vit_b_16(self): + if not self.required_envs([self.image_dataset]): + self.skipTest("missing required envs") + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/vit_b_16.py", + "--dataset", + self.image_dataset, + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--ip", + self.ip, + "--port", + str(self.port), + "--seed", + str(1126), + ] + if self.host: + cmds.extend(["--host", self.host]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + self.assertGreaterEqual(msg["top_1"], 72) + self.assertGreaterEqual(msg["top_5"], 96) + def test_whisper(self): if not self.required_envs(): self.skipTest("missing required envs") diff --git a/backends/samsung/CMakeLists.txt b/backends/samsung/CMakeLists.txt index fff3ece5239..6ea020c0970 100644 --- a/backends/samsung/CMakeLists.txt +++ b/backends/samsung/CMakeLists.txt @@ -161,7 +161,7 @@ if(${ANDROID}) install( TARGETS enn_backend enn_logging EXPORT ExecuTorchTargets - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} ) endif() diff --git a/backends/samsung/_passes/annotate_qparams.py b/backends/samsung/_passes/annotate_qparams.py new file mode 100644 index 00000000000..663d1fdf5fa --- /dev/null +++ 
b/backends/samsung/_passes/annotate_qparams.py @@ -0,0 +1,201 @@ +# Copyright (c) 2025 Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import operator +from typing import Any, Dict, List, Optional + +import torch +from executorch.backends.samsung.utils.constants import QuantConstants +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult +from torch._export.utils import get_buffer +from torch.export import ExportedProgram +from torch.fx import GraphModule, Node + + +class AnnotateQparamsPass(ExportPass): + """This parse is to add quantize properties to node need to be quantized. + + Annotate Quant params: + For src_node->Q->DQ->..., we will add the quant params from Q->DQ node + to the src_node + + Annotate Requantize: + For src_node->Q->DQ->Q->DQ->..., if the multiple Q->DQ contains + different quant params, we will mark the src_node as need requantize, + and add Q->DQ after removing all the Q->DQs. + """ + + propagate_nodes = { + exir_ops.edge.aten.view_copy.default, + exir_ops.edge.aten.permute_copy.default, + exir_ops.edge.aten.squeeze_copy.default, + exir_ops.edge.aten.squeeze_copy.dim, + exir_ops.edge.aten.squeeze_copy.dims, + exir_ops.edge.aten.slice_copy.Tensor, + exir_ops.edge.aten.unsqueeze_copy.default, + exir_ops.edge.aten.concat.default, + exir_ops.edge.aten.cat.default, + exir_ops.edge.aten.expand_copy.default, + } + + def __init__(self, edge_program: ExportedProgram): + super().__init__() + self.edge_program = edge_program + + def _get_last_dqs(self, node: Node) -> List[Node]: + r"""From one Q-DQ node, find the last DQs in the quantization node chain. + + + need to consider such case: + /--Q-DQ-node1 + node->Q->DQ--node-node2 + \--Q-DQ-node3 + This is a dfs implemention, so result will keep sorted + Args: + node (Node): Search DQ from this node. 
+ + Returns: + List[Node]: list of DQ node by original sequence + """ + + def _impl(node: Node, res_list: List[Node]): + if ( + node.target not in QuantConstants.QUANT_OPS_KEY_MAP + and node.target not in QuantConstants.DEQUANT_OPS_KEY_MAP + ): + return + for user in node.users.keys(): + if ( + user.target not in QuantConstants.QUANT_OPS_KEY_MAP + and user.target not in QuantConstants.DEQUANT_OPS_KEY_MAP + ): + res_list.append(node) + else: + _impl(user, res_list) + + res_list: List[Node] = [] + for user in node.users: + _impl(user, res_list) + return res_list + + def _propagate_quant_params(self, node: Node): + assert ( + quantize_attrs := node.meta.get("quantize_attrs") + ), "Must be annotated node." + requantize_map: Dict[Node, Node] = node.meta.get("requantize", {}) + while node.users: + if len(node.users) != 1: + break + user = list(node.users.keys())[0] + if ( + user.target not in QuantConstants.QUANT_OPS_KEY_MAP + and user.target not in QuantConstants.DEQUANT_OPS_KEY_MAP + ): + break + node = user + # Case1: ...-q-dq(cur)-propagate_node-node(not d-dq) + # Case2: propagate_node(propagateed)-propagate_node-node(not q-dq) + for idx, user in enumerate(node.users.keys()): + # For the branch who need to be requantized, we propagate the requantize params + user_attrs = requantize_map.get(idx, quantize_attrs) + if user.target not in self.propagate_nodes: + continue + if len(user.users) == 1: + # Possibily no need for checking len(users)>1 + user_of_user = list(user.users)[0] + # node-q-dq-propagate-q-dq not need for propagatey + if ( + user_of_user.target in QuantConstants.QUANT_OPS_KEY_MAP + or user_of_user.target in QuantConstants.DEQUANT_OPS_KEY_MAP + ): + continue + # propagate quant for node-q-dq-propagate_node-node(not qdq) + user.meta["quantize_attrs"] = user_attrs + self._propagate_quant_params(user) + + def _annotate_requantize(self, node: Node): + assert ( + ori_quant_attrs := node.meta.get("quantize_attrs") + ), "No quant parameters found" + 
list_for_requantize = self._get_last_dqs(node) + node.meta["requantize"] = node.meta.get("requantize", {}) + + # We use index to mark the output to be requantized + # Because user obj and name may change when we requantize them. + + def _check_same(requant_obj, ori_obj) -> bool: + if type(requant_obj) != type(ori_obj): # noqa E721 + # We need actually same type here. + return False + if not isinstance(requant_obj, torch.Tensor): + return requant_obj == ori_obj + if requant_obj.shape != ori_obj.shape: + return False + return bool((requant_obj == ori_obj).all()) + + requantize_map: Dict[int, Dict] = node.meta["requantize"] + for idx, dq in enumerate(list_for_requantize): + q = dq.all_input_nodes[0] + if q.target not in QuantConstants.QUANT_OPS_KEY_MAP: + continue + key_map = QuantConstants.DEQUANT_OPS_KEY_MAP[dq.target] + requantize_attrs = self.get_quant_attrs(q, key_map) + if not all( + _check_same(ori_quant_attrs[key], requantize_attrs[key]) + for key in key_map.values() + ): + requantize_map[idx] = requantize_attrs + + def _annotate(self, graph_module: GraphModule): + for node in graph_module.graph.nodes: + key_map = QuantConstants.QUANT_OPS_KEY_MAP.get(node.target, None) + if not key_map: + continue + source_node = node.args[0] + if source_node.target in ( + *QuantConstants.QUANT_OPS_KEY_MAP, + *QuantConstants.DEQUANT_OPS_KEY_MAP, + ): + # Currently, don't add quant info for d_qd node here. 
+ continue + elif source_node.target == operator.getitem: + source_node = source_node.args[0] + quant_attrs = self.get_quant_attrs(node, key_map) + source_node.meta["quantize_attrs"] = quant_attrs + self._annotate_requantize(source_node) + self._propagate_quant_params(source_node) + + def call(self, graph_module: GraphModule): + self._annotate(graph_module) + graph_module.recompile() + return PassResult(graph_module, True) + + def get_quant_attrs( + self, quant_node: torch.fx.Node, key_map: Optional[Dict] = None + ) -> Dict[str, Any]: + quant_attr_keys = [arg.name for arg in quant_node.target._schema.arguments] + quant_attrs = dict.fromkeys(quant_attr_keys) + for key, attr in zip(quant_attr_keys[1:], quant_node.args[1:]): + # For channel-wise quantization, params are stored by buffer nodes. + if isinstance(attr, torch.fx.Node): + attr = get_buffer(self.edge_program, attr) + quant_attrs[key] = attr + quant_attrs["target"] = quant_node.target + if key_map is None: + return quant_attrs + miss_attrs = [] + for aten_attr, snc_attr in key_map.items(): + if aten_attr not in quant_attrs: + miss_attrs.append(aten_attr) + continue + attr = quant_attrs[aten_attr] + quant_attrs.pop(aten_attr) + quant_attrs[snc_attr] = attr + assert ( + not miss_attrs + ), f"Miss quant attrs {miss_attrs} for node {quant_node.name}" + return quant_attrs diff --git a/backends/samsung/_passes/annotate_scalar_parameters.py b/backends/samsung/_passes/annotate_scalar_parameters.py new file mode 100644 index 00000000000..643685bdb25 --- /dev/null +++ b/backends/samsung/_passes/annotate_scalar_parameters.py @@ -0,0 +1,65 @@ +# Copyright (c) 2025 Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import torch +from executorch.backends.samsung.quantizer.quantizer import global_quant_info +from executorch.backends.samsung.utils.constants import QuantConstants +from executorch.backends.transforms.utils import get_param_tensor, is_param_node +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult +from torch.export import ExportedProgram + + +class AnnotateScalarParametersPass(ExportPass): + """ + Need to add quantization parameters for scalars for some ops + Ifm(Quantized)------TargetOP--- + Scalar(Non-Quant)---/ + Notice: Such scalars are converted to tensor node by default pass + """ + + TARGET_OPS = { + exir_ops.edge.aten.mul.Tensor, + exir_ops.edge.aten.add.Tensor, + exir_ops.edge.aten.div.Tensor, + } + + def __init__(self, edge_program: ExportedProgram): + super().__init__() + self.edge_program = edge_program + + def annotate(self, graph_module: torch.fx.GraphModule): + for node in graph_module.graph.nodes: + if node.target not in self.TARGET_OPS or "quantize_attrs" not in node.meta: + continue + torch_quant_dtype = global_quant_info.weight_precison.torch_dtype + for input_arg in node.all_input_nodes: + if input_arg.op not in ("placeholder", "get_attr") or not is_param_node( + self.edge_program, input_arg + ): + continue + else: + tensor = get_param_tensor(self.edge_program, input_arg) + if not tensor.shape: + qparams = { + QuantConstants.QUANT_KEY.scale: float(tensor), + QuantConstants.QUANT_KEY.quant_dtype: torch_quant_dtype, + QuantConstants.QUANT_KEY.quant_max: torch.iinfo( + torch_quant_dtype + ).max, + QuantConstants.QUANT_KEY.quant_min: torch.iinfo( + torch_quant_dtype + ).min, + QuantConstants.QUANT_KEY.zero_point: 0, + } + input_arg.meta["quantize_attrs"] = qparams + + def call(self, graph_module: torch.fx.GraphModule): + graph = graph_module.graph + self.annotate(graph_module) + graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff 
--git a/backends/samsung/_passes/conv1d_to_conv2d.py b/backends/samsung/_passes/conv1d_to_conv2d.py index 57f1074b348..1b8782d956b 100644 --- a/backends/samsung/_passes/conv1d_to_conv2d.py +++ b/backends/samsung/_passes/conv1d_to_conv2d.py @@ -5,84 +5,93 @@ # LICENSE file in the root directory of this source tree. import torch +from executorch.backends.transforms.utils import get_param_tensor from executorch.exir import ExportedProgram from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult -from torch._export.utils import get_param class Conv1dToConv2d(ExportPass): - def __init__(self, edge_program: ExportedProgram): super().__init__() self.edge_program = edge_program + def update_kernel(self, weight_node: torch.Tensor): + # lifted tensor in tensor constant + weight_3d = get_param_tensor(self.edge_program, weight_node) + if param_name := self.edge_program.graph_signature.inputs_to_parameters.get( + weight_node.name + ): + new_weight_param = torch.nn.Parameter( + data=weight_3d.data.contiguous().unsqueeze(dim=-1), requires_grad=False + ) + self.edge_program.state_dict[param_name] = new_weight_param + elif tensor_name := self.edge_program.graph_signature.inputs_to_lifted_tensor_constants.get( + weight_node.name + ): + self.edge_program.constants[tensor_name] = torch.unsqueeze(weight_3d, -1) + else: + RuntimeError("Weight of 1d conv should be constant tensor or Parameter obj") + weight_node.meta["val"] = weight_node.meta["val"].data.unsqueeze(dim=-1) + def call(self, graph_module: torch.fx.GraphModule): graph = graph_module.graph node_list = list(graph.nodes) for node in node_list: - if node.op == "call_function": - if node.target == exir_ops.edge.aten.convolution.default: - stride = list(node.args[3]) - if len(stride) != 1: - continue + if node.op != "call_function": + continue + if node.target != exir_ops.edge.aten.convolution.default: + continue + stride = list(node.args[3]) + if len(stride) != 1: + 
continue - # convert 3dim weight to 4dim - weight_node = node.args[1] - weight_3dim = get_param(self.edge_program, weight_node) - weight_4dim = torch.nn.Parameter( - data=weight_3dim.data.contiguous().unsqueeze(dim=-1), - requires_grad=False, - ) - parameter_name = ( - self.edge_program.graph_signature.inputs_to_parameters[ - weight_node.name - ] - ) - self.edge_program.state_dict[parameter_name] = weight_4dim - weight_node.meta["val"] = weight_node.meta["val"].data.unsqueeze( - dim=-1 - ) + # convert 3dim weight to 4dim + weight_node = node.args[1] + self.update_kernel(weight_node) - # Extend stride, padding, and dilation - node.args = ( - node.args[0], - node.args[1], - node.args[2], - node.args[3] + [1], # stride - node.args[4] + [0], # padding - node.args[5] + [1], # dilation - node.args[6], - node.args[7], - node.args[8], - ) + # Extend stride, padding, and dilation + node.args = ( + node.args[0], + node.args[1], + node.args[2], + node.args[3] + [1], # stride + node.args[4] + [0], # padding + node.args[5] + [1], # dilation + node.args[6], + node.args[7], + node.args[8], + ) + # unsqueeze -> conv2d -> squeeze - # unsqueeze -> conv2d -> squeeze - with graph.inserting_before(node): - input_node = node.args[0] - unsqueeze_before = graph.create_node( - "call_function", exir_ops.edge.aten.unsqueeze_copy.default - ) - unsqueeze_before.args = ( - input_node, - -1, - ) - node.replace_input_with(input_node, unsqueeze_before) + with graph.inserting_before(node): + input_node = node.args[0] + prev_qparams = input_node.meta.get("quantize_attrs") + unsqueeze_before = graph.create_node( + "call_function", exir_ops.edge.aten.unsqueeze_copy.default + ) + unsqueeze_before.args = ( + input_node, + -1, + ) + node.replace_input_with(input_node, unsqueeze_before) - with graph.inserting_after(node): - squeeze_after = graph.create_node( - "call_function", exir_ops.edge.aten.squeeze_copy.dims - ) - squeeze_after.args = ( - node, - [-1], - ) - original_users = [ - user for user in 
node.users if user != squeeze_after - ] - for user in original_users: - user.replace_input_with(node, squeeze_after) + with graph.inserting_after(node): + squeeze_after = graph.create_node( + "call_function", exir_ops.edge.aten.squeeze_copy.dims + ) + squeeze_after.args = ( + node, + [-1], + ) + original_users = [user for user in node.users if user != squeeze_after] + for user in original_users: + user.replace_input_with(node, squeeze_after) + if quant_attr := node.meta.get("quantize_attrs"): + squeeze_after.meta["quantize_attrs"] = quant_attr + if prev_qparams is not None: + unsqueeze_before.meta["quantize_attrs"] = prev_qparams graph_module.recompile() - graph_module = super().call(graph_module).graph_module + _ = super().call(graph_module).graph_module return PassResult(graph_module, True) diff --git a/backends/samsung/_passes/fold_qdq.py b/backends/samsung/_passes/fold_qdq.py new file mode 100644 index 00000000000..c6f3699ece7 --- /dev/null +++ b/backends/samsung/_passes/fold_qdq.py @@ -0,0 +1,36 @@ +# Copyright (c) 2025 Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from executorch.backends.samsung.utils.constants import QuantConstants +from executorch.exir.pass_base import ExportPass, PassResult +from executorch.exir.passes import dead_code_elimination_pass +from torch.fx import GraphModule + + +class FoldQDQPass(ExportPass): + def __init__(self): + super().__init__() + + def _fold( + self, + graph_module: GraphModule, + ): + for node in graph_module.graph.nodes: + if node.target not in ( + *QuantConstants.QUANT_OPS_KEY_MAP.keys(), + *QuantConstants.DEQUANT_OPS_KEY_MAP.keys(), + ): + continue + for user in [user for user in node.users.keys()]: # noqa: C416 + user.replace_input_with(node, node.args[0]) + graph_module.graph.erase_node(node) + + def call(self, graph_module: GraphModule): + self._fold(graph_module) + graph_module.recompile() + dead_code_elimination_pass(graph_module) + _ = super().call(graph_module).graph_module + return PassResult(graph_module, True) diff --git a/backends/samsung/_passes/fuse_conv_act.py b/backends/samsung/_passes/fuse_conv_act.py new file mode 100644 index 00000000000..c034c98bb14 --- /dev/null +++ b/backends/samsung/_passes/fuse_conv_act.py @@ -0,0 +1,77 @@ +# Copyright (c) 2025 Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Optional + +import torch +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult +from executorch.exir.passes import dead_code_elimination_pass +from torch.fx import GraphModule + + +def map_hardtan_relux(tanhnode: torch.fx.node.Node) -> Optional[str]: + assert ( + tanhnode.target == exir_ops.edge.aten.hardtanh.default + ), "Must be a hardtanh node" + if not tanhnode.args[1] == 0.0: + return None + if tanhnode.args[2] == 6.0: + return "RELU6" + return None + + +class FuseConvActPass(ExportPass): + TARGET_ACTS_MAP = { + exir_ops.edge.aten.relu.default: (lambda x: "RELU"), + exir_ops.edge.aten.relu_.default: (lambda x: "RELU"), + exir_ops.edge.aten.relu6.default: (lambda x: "RELU6"), + exir_ops.edge.aten.relu6_.default: (lambda x: "RELU6"), + exir_ops.edge.aten.hardtanh.default: map_hardtan_relux, + exir_ops.edge.aten.hardtanh_.default: map_hardtan_relux, + } + + def _fuse( + self, + graph_module: GraphModule, + ): + for target_conv, target_act in self.get_target_conv_act(graph_module): + assert ( + act_name := self.TARGET_ACTS_MAP.get(target_act.target)(target_act) + ), f"Not supported {target_act.name} now." 
+ target_conv.meta["activation"] = act_name + if "quantize_attrs" in target_act.meta: + target_conv.meta["quantize_attrs"] = target_act.meta["quantize_attrs"] + + # If we merge the real out activation to conv, the conv should be the real out + if "real_out" in target_act.meta: + target_conv.meta["real_out"] = target_act.meta["real_out"] + for user in [user for user in target_act.users.keys()]: # noqa: C416 + user.replace_input_with(target_act, target_conv) + graph_module.graph.erase_node(target_act) + + def get_target_conv_act(self, graph_module: GraphModule): + for node in graph_module.graph.nodes: + if node.target != exir_ops.edge.aten.convolution.default: + continue + if len(node.users) != 1: + # Such cases couldn't be conv + act + continue + act_node = list(node.users.keys())[0] + if act_node.target not in self.TARGET_ACTS_MAP: + continue + if "quantize_attrs" in node.meta: + # If the conv's output is quantized + # We do not fuse them + continue + yield node, act_node + + def call(self, graph_module: GraphModule): + self._fuse(graph_module) + graph_module.recompile() + dead_code_elimination_pass(graph_module) + _ = super().call(graph_module).graph_module + return PassResult(graph_module, True) diff --git a/backends/samsung/_passes/insert_qdq.py b/backends/samsung/_passes/insert_qdq.py new file mode 100644 index 00000000000..a59b011ac4b --- /dev/null +++ b/backends/samsung/_passes/insert_qdq.py @@ -0,0 +1,164 @@ +# Copyright (c) 2025 Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from enum import Enum +from typing import Any, Dict + +import torch +from executorch.backends.samsung._passes.utils import none_quant_tensor_quant_meta +from executorch.backends.samsung.utils.constants import QuantConstants +from executorch.backends.samsung.utils.utils import is_graph_input, is_graph_output + +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult +from torch.export import ExportedProgram +from torch.fx import GraphModule + + +class QType(Enum): + Quant = 0 + Dequant = 1 + + +class InsertQDQPass(ExportPass): + QDQ_MAP = { + # per tensor + exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor, + exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor: exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor, + # per channel + exir_ops.edge.quantized_decomposed.quantize_per_channel.default: exir_ops.edge.quantized_decomposed.dequantize_per_channel.default, + } + + def __init__(self, edge_program: ExportedProgram): + super().__init__() + self.edge_program = edge_program + + def _create_qdq_node( + self, + graph_module: GraphModule, + qtype: QType, + input_node: torch.fx.Node, + quant_attrs: Dict[str, Any], + ) -> torch.fx.Node: + assert (target := quant_attrs.get("target")), "" + new_node_args = [input_node] + new_node_meta_val = input_node.meta["val"] + new_node_quant_attrs = {} + if qtype == QType.Dequant: + target = self.QDQ_MAP[target] + else: + # For input node, we should set the val type as quant type + key = QuantConstants.QUANT_KEY.quant_dtype + new_node_meta_val = new_node_meta_val.to(quant_attrs[key]) + new_node_quant_attrs.update(quant_attrs) + + for arg in target._schema.arguments[1:]: + name = arg.name + if name == "out_dtype": + continue + if qtype == QType.Quant: + key = QuantConstants.QUANT_OPS_KEY_MAP[target].get(name, name) + else: + key = 
QuantConstants.DEQUANT_OPS_KEY_MAP[target].get(name, name) + arg_value = quant_attrs[key] + if isinstance(arg.type, torch.Tensor) and ( + isinstance(arg_value, int) or isinstance(arg_value, float) + ): + arg_value = torch.Tensor(arg_value) + new_node_args.append(arg_value) + + new_node = graph_module.graph.create_node( + "call_function", target, tuple(new_node_args) + ) + if new_node_quant_attrs: + new_node.meta["quantize_attrs"] = new_node_quant_attrs + else: + new_node.meta["quantize_attrs"] = { + QuantConstants.QUANT_KEY.quant_dtype: torch.float32, + QuantConstants.QUANT_KEY.scale: [1.0], + QuantConstants.QUANT_KEY.zero_point: [0], + } + new_node.meta["val"] = new_node_meta_val + return new_node + + def _add_dq_after(self, graph_module: GraphModule, node: torch.fx.Node): + if not (quant_attrs := node.meta.get("quantize_attrs")): + return + with graph_module.graph.inserting_after(node): + new_node = self._create_qdq_node( + graph_module, QType.Dequant, node, quant_attrs + ) + users = [user for user in node.users.keys() if (user.op == "output")] + for user in users: + user.replace_input_with(node, new_node) + + def _add_q_after(self, graph_module: GraphModule, node: torch.fx.Node): + # In node don't need quant attrs after insert new quantize node. 
+ if not (quant_attrs := node.meta.pop("quantize_attrs", None)): + return + node.meta["quantize_attrs"] = none_quant_tensor_quant_meta() + with graph_module.graph.inserting_after(node): + users = list(node.users.keys()) + new_node = self._create_qdq_node( + graph_module, QType.Quant, node, quant_attrs + ) + for user in users: + if user.target not in QuantConstants.QUANT_OPS_KEY_MAP: + user.replace_input_with(node, new_node) + + def _add_q_before( + self, + graph_module: GraphModule, + node: torch.fx.Node, + from_node: torch.fx.Node, + quantize_attrs: Dict, + ): + with graph_module.graph.inserting_before(node): + new_quant_node = self._create_qdq_node( + graph_module, QType.Quant, from_node, quantize_attrs + ) + node.replace_input_with(from_node, new_quant_node) + return new_quant_node + + def _add_dq_before( + self, + graph_module: GraphModule, + node: torch.fx.Node, + from_node: torch.fx.Node, + quantize_attrs: Dict, + ): + with graph_module.graph.inserting_before(node): + new_dequant_node = self._create_qdq_node( + graph_module, QType.Dequant, from_node, quantize_attrs + ) + node.replace_input_with(from_node, new_dequant_node) + return new_dequant_node + + def _add_qdq_for_requantize(self, graph_module: GraphModule): + for node in graph_module.graph.nodes: + requant_map: Dict[int, Dict] = node.meta.get("requantize") + if requant_map is None: + continue + assert (ori_quant_attrs := node.meta.get("quantize_attrs")) + usr_list = list(node.users.keys()) + for user_idx, requant_params in requant_map.items(): + user = usr_list[user_idx] + q_node = self._add_q_before(graph_module, user, node, requant_params) + _ = self._add_dq_before(graph_module, q_node, node, ori_quant_attrs) + + def _add_qdq(self, graph_module: GraphModule): + for node in list(graph_module.graph.nodes): + if is_graph_input(self.edge_program, node): + self._add_q_after(graph_module, node) + elif is_graph_output(node): + self._add_dq_after(graph_module, node) + + def call(self, graph_module: 
GraphModule): + self._add_qdq(graph_module) + self._add_qdq_for_requantize(graph_module) + graph_module.graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/samsung/_passes/remove_useless_ops.py b/backends/samsung/_passes/remove_useless_ops.py new file mode 100644 index 00000000000..c88a2d4a5d8 --- /dev/null +++ b/backends/samsung/_passes/remove_useless_ops.py @@ -0,0 +1,87 @@ +# Copyright (c) 2025 Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ExportPass, PassResult +from executorch.exir.passes import dead_code_elimination_pass +from torch.fx import GraphModule + + +class RemoveUselessOpPass(ExportPass): + # such ops should be single-in and single-out + USELESS_OP_SET = { + exir_ops.edge.aten._to_copy.default, + exir_ops.edge.aten.clone.default, + exir_ops.edge.aten.clone.default, + exir_ops.edge.aten.alias.default, + exir_ops.edge.aten.lift_fresh_copy.default, + exir_ops.edge.dim_order_ops._to_dim_order_copy.default, + } + + def __init__(self): + super().__init__() + + def gen_pattern_as_strided_copy(self, graph_module: GraphModule): + for node in list(graph_module.graph.nodes): # noqa: C416 + if node.target != exir_ops.edge.aten.mean.dim: + continue + if len(node.users) != 1: + continue + successor = list(node.users.keys())[0] + if successor.target != exir_ops.edge.aten.as_strided_copy.default: + continue + is_pattern = True + count = 0 + for i, stride in enumerate(successor.args[2]): + if stride < node.meta["val"].size()[i]: + if stride == 1: + count += 1 + else: + is_pattern = False + break + if count >= 2: + is_pattern = False + break + if is_pattern: + yield successor + + def _fold_as_strided_copy( + self, + graph_module: GraphModule, + ): + for 
as_strided_copy_node in self.gen_pattern_as_strided_copy(graph_module): + for user in list(as_strided_copy_node.users.keys()): + user.replace_input_with( + as_strided_copy_node, as_strided_copy_node.args[0] + ) + graph_module.graph.erase_node(as_strided_copy_node) + + def _remove_useless( + self, + graph_module: GraphModule, + ): + for node in graph_module.graph.nodes: + if node.target not in self.USELESS_OP_SET: + continue + + # Prevent from removing if data type may change. + if ( + node.target == exir_ops.edge.aten._to_copy.default + or node.target == exir_ops.edge.dim_order_ops._to_dim_order_copy.default + ) and "memory_format" not in node.kwargs: + continue + + for user in [user for user in node.users.keys()]: # noqa: C416 + user.replace_input_with(node, node.all_input_nodes[0]) + graph_module.graph.erase_node(node) + self._fold_as_strided_copy(graph_module) + + def call(self, graph_module: GraphModule): + self._remove_useless(graph_module) + graph_module.recompile() + dead_code_elimination_pass(graph_module) + _ = super().call(graph_module).graph_module + return PassResult(graph_module, True) diff --git a/backends/samsung/_passes/utils.py b/backends/samsung/_passes/utils.py new file mode 100644 index 00000000000..afa7c72c601 --- /dev/null +++ b/backends/samsung/_passes/utils.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025 Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import torch + + +def none_quant_tensor_quant_meta(): + return { + "quant_dtype": torch.float32, + "scales": 1, + "zero_points": 0, + } diff --git a/backends/samsung/build.sh b/backends/samsung/build.sh index dfa6407ff50..4845c760f0c 100755 --- a/backends/samsung/build.sh +++ b/backends/samsung/build.sh @@ -45,6 +45,7 @@ function build_x86_64() { -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -S ${PROJECT_DIR} \ -B ${X86_64_BUILD_DIR} @@ -77,6 +78,7 @@ function build_android() { -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_ENABLE_LOGGING=1 \ -DEXECUTORCH_BUILD_DEVTOOLS=ON \ diff --git a/backends/samsung/builders/__init__.py b/backends/samsung/builders/__init__.py index 02a457fd06e..978da82b370 100644 --- a/backends/samsung/builders/__init__.py +++ b/backends/samsung/builders/__init__.py @@ -14,11 +14,13 @@ op_clamp, op_constant_pad_nd, op_conv2d, + op_dequantize, op_div, op_embedding, op_expand_copy, op_gelu, op_getitem, + op_hardsigmoid, op_hardswish, op_hardtanh, op_layer_norm, @@ -32,6 +34,7 @@ op_mul, op_permute, op_pixel_shuffle, + op_quantize, op_relu, op_reshape, op_rsqrt, @@ -57,6 +60,7 @@ op_clamp, op_conv2d, op_constant_pad_nd, + op_dequantize, op_div, op_embedding, op_expand_copy, @@ -64,6 +68,7 @@ op_getitem, op_hardswish, op_hardtanh, + op_hardsigmoid, op_layer_norm, op_leaky_relu, op_linear, @@ -75,6 +80,7 @@ op_mul, op_permute, op_pixel_shuffle, + op_quantize, op_relu, op_reshape, op_rsqrt, diff --git a/backends/samsung/builders/node_visitor.py b/backends/samsung/builders/node_visitor.py index a35c0b4715d..0d2707da8f5 100644 --- a/backends/samsung/builders/node_visitor.py +++ 
b/backends/samsung/builders/node_visitor.py @@ -14,6 +14,7 @@ get_tensor_type, ) from executorch.backends.samsung.serialization.enn_graph_schema import EnnGraph +from executorch.backends.samsung.utils.constants import QuantConstants from executorch.backends.transforms.utils import is_param_node from torch.export import ExportedProgram @@ -61,18 +62,26 @@ def define_tensor( dims = [1] if len(tensor.size()) == 0 else list(tensor.size()) + quant_attrs = node.meta.get("quantize_attrs") enn_tensor_id = enn_graph.define_tensor( node.name, dims, data_type, tensor_type.name, const_data, + quant_param=quant_attrs, ) assert enn_tensor_id is not None vals_to_ids[node] = enn_tensor_id return enn_tensor_id + def _update_params_qdtype(self, node: torch.fx.Node, params: Dict): + if qdtype := node.meta.get("quantize_attrs", {}).get( + QuantConstants.QUANT_KEY.quant_dtype + ): + params["quant_dtype"] = EnnGraph._affine_meta_param(qdtype) + _node_visitor_dict = {} @@ -92,6 +101,7 @@ def register_node_visitor(visitor): raise TypeError( f"target of vistor should be str|Tuple[str]|List[str], not{type(visitor.target)}" ) + return visitor def get_node_visitors(*args) -> Dict[str, NodeVisitor]: diff --git a/backends/samsung/builders/op_add.py b/backends/samsung/builders/op_add.py index 1b0dddb0d02..a6eb79897dd 100644 --- a/backends/samsung/builders/op_add.py +++ b/backends/samsung/builders/op_add.py @@ -3,6 +3,7 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+ from typing import Dict import torch @@ -28,9 +29,13 @@ def define_node( ) -> None: input1 = node.args[0] input_id_1 = self.define_tensor(input1, enn_graph, vals_to_ids) + params = {} + self._update_params_qdtype(node, params) input2 = node.args[1] input_id_2 = self.define_tensor(input2, enn_graph, vals_to_ids) output_id = self.define_tensor(node, enn_graph, vals_to_ids) - enn_graph.define_op(node.name, "ELTSUM", [input_id_1, input_id_2], [output_id]) + enn_graph.define_op( + node.name, "ELTSUM", [input_id_1, input_id_2], [output_id], params + ) diff --git a/backends/samsung/builders/op_avg_pool2d.py b/backends/samsung/builders/op_avg_pool2d.py index ad7ccbac3ae..bfca8b89b22 100644 --- a/backends/samsung/builders/op_avg_pool2d.py +++ b/backends/samsung/builders/op_avg_pool2d.py @@ -3,6 +3,7 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + from typing import cast, Dict, List import torch @@ -49,6 +50,7 @@ def define_node( params["stride_w"] = stride[1] params["padding"] = "EXPLICIT" params["explicit_padding"] = explicit_padding + self._update_params_qdtype(node, params) if len(node.args) > 4: ceil_mode = cast(bool, node.args[4]) @@ -64,7 +66,5 @@ def define_node( assert ( divisor_override == kernel_size[0] * kernel_size[1] ), "Not supported divisor_override which is not equal to pooling region." 
- output_id = self.define_tensor(node, enn_graph, vals_to_ids) - enn_graph.define_op(node.name, "AVGPOOL2D", [input_id], [output_id], params) diff --git a/backends/samsung/builders/op_bmm.py b/backends/samsung/builders/op_bmm.py index 6ba8864ebb3..13e0d19cb14 100644 --- a/backends/samsung/builders/op_bmm.py +++ b/backends/samsung/builders/op_bmm.py @@ -16,7 +16,7 @@ @register_node_visitor class BMMVisitor(NodeVisitor): - target = "aten.bmm.default" + target = ["aten.bmm.default"] def __init__(self, *args) -> None: super().__init__(*args) @@ -29,12 +29,15 @@ def define_node( ) -> None: input1 = node.args[0] input_id_1 = self.define_tensor(input1, enn_graph, vals_to_ids) + input2 = node.args[1] input_id_2 = self.define_tensor(input2, enn_graph, vals_to_ids) # output output_id = self.define_tensor(node, enn_graph, vals_to_ids) + params = {} + self._update_params_qdtype(node, params) enn_graph.define_op( - node.name, "BATCH_MATMUL", [input_id_1, input_id_2], [output_id] + node.name, "BATCH_MATMUL", [input_id_1, input_id_2], [output_id], params ) diff --git a/backends/samsung/builders/op_cat.py b/backends/samsung/builders/op_cat.py index e9c0a32b389..09387f2e361 100644 --- a/backends/samsung/builders/op_cat.py +++ b/backends/samsung/builders/op_cat.py @@ -3,6 +3,7 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+ from typing import cast, Dict, List import torch @@ -12,6 +13,7 @@ ) from executorch.backends.samsung.serialization.enn_graph_schema import EnnGraph from executorch.backends.transforms import get_shape +from executorch.backends.transforms.utils import is_param_node @register_node_visitor @@ -29,14 +31,20 @@ def define_node( ) -> None: tensors = cast(List[torch.fx.Node], node.args[0]) input_tensor_ids = [] - - for in_tensor in tensors: + constant_idx = None + for idx, in_tensor in enumerate(tensors): + if is_param_node(self.exported_program, in_tensor): + assert constant_idx is None, "Only support at most 1 constant tensor" + constant_idx = idx input_id = self.define_tensor(in_tensor, enn_graph, vals_to_ids) input_tensor_ids.append(input_id) in_shape = get_shape(node) axis = cast(int, node.args[1]) % len(in_shape) if len(node.args) >= 2 else 0 params = {"axis": axis} + if constant_idx is not None: + params["constant_index"] = constant_idx + self._update_params_qdtype(node, params) output_id = self.define_tensor(node, enn_graph, vals_to_ids) enn_graph.define_op(node.name, "CONCAT", input_tensor_ids, [output_id], params) diff --git a/backends/samsung/builders/op_clamp.py b/backends/samsung/builders/op_clamp.py index c5670b80fa3..74af83212a5 100644 --- a/backends/samsung/builders/op_clamp.py +++ b/backends/samsung/builders/op_clamp.py @@ -3,6 +3,7 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+ from typing import cast, Dict import torch @@ -32,12 +33,15 @@ def define_node( # The default value of lower bound and upper bound output_min = torch.finfo(torch.float32).min output_max = torch.finfo(torch.float32).max + if node.args[1] is not None: output_min = cast(float, node.args[1]) if len(node.args) > 2 and node.args[2] is not None: output_max = cast(float, node.args[2]) params = {"minimum": output_min, "maximum": output_max} + self._update_params_qdtype(node, params) + output_id = self.define_tensor(node, enn_graph, vals_to_ids) enn_graph.define_op(node.name, "CLIP", [input_id], [output_id], params) diff --git a/backends/samsung/builders/op_conv2d.py b/backends/samsung/builders/op_conv2d.py index 881a533801f..ab77d8df626 100644 --- a/backends/samsung/builders/op_conv2d.py +++ b/backends/samsung/builders/op_conv2d.py @@ -3,6 +3,7 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + from typing import cast, Dict, List import torch @@ -56,6 +57,9 @@ def define_node( input_shape = get_shape(input) kernel_shape = get_shape(weight_node) params = {} + self._update_params_qdtype(node, params) + if "activation" in node.meta: + params["activation"] = node.meta["activation"] params["kernel_h"] = kernel_shape[2] params["kernel_w"] = kernel_shape[3] params["stride_h"] = stride[0] diff --git a/backends/samsung/builders/op_dequantize.py b/backends/samsung/builders/op_dequantize.py new file mode 100644 index 00000000000..a1c31af4037 --- /dev/null +++ b/backends/samsung/builders/op_dequantize.py @@ -0,0 +1,19 @@ +# Copyright (c) 2025 Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from executorch.backends.samsung.builders.node_visitor import register_node_visitor +from executorch.backends.samsung.builders.op_quantize import _QuantOpVistorBase + + +# Dequant ops here +@register_node_visitor +class DequantizeVistor(_QuantOpVistorBase): + target = [ + "quantized_decomposed.dequantize_per_tensor.default", + "quantized_decomposed.dequantize_per_tensor.tensor", + "quantized_decomposed.dequantize_per_channel.default", + "quantized_decomposed.dequantize_per_channel.tensor", + ] diff --git a/backends/samsung/builders/op_div.py b/backends/samsung/builders/op_div.py index 89d773ddb0e..8b0e7cdd5af 100644 --- a/backends/samsung/builders/op_div.py +++ b/backends/samsung/builders/op_div.py @@ -27,13 +27,16 @@ def define_node( enn_graph: EnnGraph, vals_to_ids: Dict[torch.Tensor, int], ) -> None: - # inputs input1 = node.args[0] input_id_1 = self.define_tensor(input1, enn_graph, vals_to_ids) + input2 = node.args[1] input_id_2 = self.define_tensor(input2, enn_graph, vals_to_ids) - + params = {} + self._update_params_qdtype(node, params) # output output_id = self.define_tensor(node, enn_graph, vals_to_ids) - enn_graph.define_op(node.name, "ELTDIV", [input_id_1, input_id_2], [output_id]) + enn_graph.define_op( + node.name, "ELTDIV", [input_id_1, input_id_2], [output_id], params + ) diff --git a/backends/samsung/builders/op_gelu.py b/backends/samsung/builders/op_gelu.py index 059a3b77850..88417f688f9 100644 --- a/backends/samsung/builders/op_gelu.py +++ b/backends/samsung/builders/op_gelu.py @@ -27,8 +27,14 @@ def define_node( enn_graph: EnnGraph, vals_to_ids: Dict[torch.Tensor, int], ) -> None: - input_id = self.define_tensor(node.args[0], enn_graph, vals_to_ids) + # input1 + input = node.args[0] + input_id = self.define_tensor(input, enn_graph, vals_to_ids) + # output output_id = self.define_tensor(node, enn_graph, vals_to_ids) - enn_graph.define_op(node.name, "GELU", [input_id], [output_id]) + params = {} + self._update_params_qdtype(node, params) + + 
enn_graph.define_op(node.name, "GELU", [input_id], [output_id], params) diff --git a/backends/samsung/builders/op_hardsigmoid.py b/backends/samsung/builders/op_hardsigmoid.py new file mode 100644 index 00000000000..3a50d65da41 --- /dev/null +++ b/backends/samsung/builders/op_hardsigmoid.py @@ -0,0 +1,35 @@ +# Copyright (c) 2025 Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Dict + +import torch +from executorch.backends.samsung.builders.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.samsung.serialization.enn_graph_schema import EnnGraph + + +@register_node_visitor +class HardSigmoidVisitor(NodeVisitor): + target = "aten.hardsigmoid.default" + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + enn_graph: EnnGraph, + vals_to_ids: Dict[torch.Tensor, int], + ) -> None: + input = node.args[0] + input_id = self.define_tensor(input, enn_graph, vals_to_ids) + output_id = self.define_tensor(node, enn_graph, vals_to_ids) + params = {} + self._update_params_qdtype(node, params) + enn_graph.define_op(node.name, "HardSigmoid", [input_id], [output_id], params) diff --git a/backends/samsung/builders/op_hardswish.py b/backends/samsung/builders/op_hardswish.py index 72a99d17b83..8c30125e8a4 100644 --- a/backends/samsung/builders/op_hardswish.py +++ b/backends/samsung/builders/op_hardswish.py @@ -29,7 +29,7 @@ def define_node( ) -> None: input = node.args[0] input_id = self.define_tensor(input, enn_graph, vals_to_ids) - + params = {} + self._update_params_qdtype(node, params) output_id = self.define_tensor(node, enn_graph, vals_to_ids) - - enn_graph.define_op(node.name, "HARDSWISH", [input_id], [output_id]) + enn_graph.define_op(node.name, "HARDSWISH", [input_id], [output_id], params) diff --git 
a/backends/samsung/builders/op_hardtanh.py b/backends/samsung/builders/op_hardtanh.py index 4f667bf5299..7d65e97a566 100644 --- a/backends/samsung/builders/op_hardtanh.py +++ b/backends/samsung/builders/op_hardtanh.py @@ -3,6 +3,7 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + from typing import cast, Dict import torch @@ -29,9 +30,12 @@ def define_node( input = node.args[0] input_id = self.define_tensor(input, enn_graph, vals_to_ids) + # default value of output_min and output_max output_min = cast(float, node.args[1]) if len(node.args) > 1 else -1 output_max = cast(float, node.args[2]) if len(node.args) > 2 else 1 + params = {"minimum": output_min, "maximum": output_max} + self._update_params_qdtype(node, params) output_id = self.define_tensor(node, enn_graph, vals_to_ids) diff --git a/backends/samsung/builders/op_layer_norm.py b/backends/samsung/builders/op_layer_norm.py index e6f853178d8..098bc92dc84 100644 --- a/backends/samsung/builders/op_layer_norm.py +++ b/backends/samsung/builders/op_layer_norm.py @@ -46,9 +46,8 @@ def define_node( epsilon = node.args[4] if len(node.args) > 4 else 1e-5 params = {"epsilon": epsilon} - + self._update_params_qdtype(node, params) output_id = self.define_tensor(node, enn_graph, vals_to_ids) - enn_graph.define_op( node.name, "LAYERNORM", all_input_tensors, [output_id], params ) diff --git a/backends/samsung/builders/op_linear.py b/backends/samsung/builders/op_linear.py index 2f7aa1e6415..720439de976 100644 --- a/backends/samsung/builders/op_linear.py +++ b/backends/samsung/builders/op_linear.py @@ -3,6 +3,7 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+ from typing import Dict import torch @@ -43,6 +44,7 @@ def define_node( weight_shape = get_shape(weight_node) params = {"in_channels": weight_shape[1], "out_channels": weight_shape[0]} + self._update_params_qdtype(node, params) output_id = self.define_tensor(node, enn_graph, vals_to_ids) diff --git a/backends/samsung/builders/op_max_pool2d.py b/backends/samsung/builders/op_max_pool2d.py index d386dd30b1a..57b716fcb34 100644 --- a/backends/samsung/builders/op_max_pool2d.py +++ b/backends/samsung/builders/op_max_pool2d.py @@ -73,6 +73,7 @@ def define_node( params["explicit_padding"] = explicit_padding params["dilation_h"] = dilation[0] params["dilation_w"] = dilation[1] + self._update_params_qdtype(node, params) if len(node.args) > 5: ceil_mode = cast(bool, node.args[5]) diff --git a/backends/samsung/builders/op_mean_dim.py b/backends/samsung/builders/op_mean_dim.py index 2f07f870ec4..3d0377703a7 100644 --- a/backends/samsung/builders/op_mean_dim.py +++ b/backends/samsung/builders/op_mean_dim.py @@ -3,6 +3,7 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+ from typing import cast, Dict, List import torch @@ -27,6 +28,7 @@ def define_node( enn_graph: EnnGraph, vals_to_ids: Dict[torch.Tensor, int], ) -> None: + # input input = node.args[0] input_id = self.define_tensor(input, enn_graph, vals_to_ids) @@ -37,8 +39,11 @@ def define_node( in_shape = get_shape(input) for dim in dims: reduce_axes.append(dim % len(in_shape)) - reduce_axes.sort() + + if len(node.args[1]) > 1: + reduce_axes.sort() keep_dim = node.args[2] if len(node.args) >= 3 else False params = {"keep_dims": keep_dim, "axis": reduce_axes} + self._update_params_qdtype(node, params) enn_graph.define_op(node.name, "REDUCEMEAN", [input_id], [output_id], params) diff --git a/backends/samsung/builders/op_mul.py b/backends/samsung/builders/op_mul.py index dce531ff0b0..6dd7c0dd9f0 100644 --- a/backends/samsung/builders/op_mul.py +++ b/backends/samsung/builders/op_mul.py @@ -1,5 +1,9 @@ -# Copyright (c) 2024 Samsung Electronics Co. LTD +# Copyright (c) 2025 Samsung Electronics Co. LTD # All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ from typing import Dict import torch @@ -23,11 +27,17 @@ def define_node( enn_graph: EnnGraph, vals_to_ids: Dict[torch.Tensor, int], ) -> None: + input1 = node.args[0] input_id_1 = self.define_tensor(input1, enn_graph, vals_to_ids) + input2 = node.args[1] input_id_2 = self.define_tensor(input2, enn_graph, vals_to_ids) + params = {} + self._update_params_qdtype(node, params) output_id = self.define_tensor(node, enn_graph, vals_to_ids) - enn_graph.define_op(node.name, "ELTMUL", [input_id_1, input_id_2], [output_id]) + enn_graph.define_op( + node.name, "ELTMUL", [input_id_1, input_id_2], [output_id], params + ) diff --git a/backends/samsung/builders/op_quantize.py b/backends/samsung/builders/op_quantize.py new file mode 100644 index 00000000000..dcf30e291f9 --- /dev/null +++ b/backends/samsung/builders/op_quantize.py @@ -0,0 +1,60 @@ +# Copyright (c) 2025 Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Dict + +import torch +from executorch.backends.samsung.builders.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.samsung.serialization.enn_graph_schema import EnnGraph +from executorch.backends.samsung.utils.constants import QuantConstants + + +class _QuantOpVistorBase(NodeVisitor): + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + enn_graph: EnnGraph, + vals_to_ids: Dict[torch.Tensor, int], + ) -> None: + # input + input = node.args[0] + input_id = self.define_tensor(input, enn_graph, vals_to_ids) + + scales = node.args[1] + if isinstance(scales, torch.Tensor): + scales = scales.tolist() + elif not isinstance(scales, list): + scales = torch.tensor(scales).reshape([1]).tolist() + zero_points = node.args[2] + if isinstance(zero_points, torch.Tensor): + zero_points = zero_points.tolist() + elif not isinstance(zero_points, list): + zero_points = torch.tensor(zero_points).reshape([1]).tolist() + + output_id = self.define_tensor(node, enn_graph, vals_to_ids) + + params = {"scales": scales, "zero_points": zero_points} + + if node.target in QuantConstants.QUANT_OPS_KEY_MAP: + enn_graph.define_op(node.name, "QUANTIZE", [input_id], [output_id], params) + else: + enn_graph.define_op( + node.name, "DEQUANTIZE", [input_id], [output_id], params + ) + + +@register_node_visitor +class QuantizeVistor(_QuantOpVistorBase): + target = [ + "quantized_decomposed.quantize_per_tensor.default", + "quantized_decomposed.quantize_per_channel.default", + ] diff --git a/backends/samsung/builders/op_relu.py b/backends/samsung/builders/op_relu.py index ba90116be1d..a4a2b6bc4f0 100644 --- a/backends/samsung/builders/op_relu.py +++ b/backends/samsung/builders/op_relu.py @@ -3,6 +3,7 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+ from typing import Dict import torch @@ -30,5 +31,7 @@ def define_node( input_id = self.define_tensor(input, enn_graph, vals_to_ids) output_id = self.define_tensor(node, enn_graph, vals_to_ids) + params = {} + self._update_params_qdtype(node, params) - enn_graph.define_op(node.name, "RELU", [input_id], [output_id]) + enn_graph.define_op(node.name, "RELU", [input_id], [output_id], params) diff --git a/backends/samsung/builders/op_softmax.py b/backends/samsung/builders/op_softmax.py index 1e2e4a378dc..7f569cea6fc 100644 --- a/backends/samsung/builders/op_softmax.py +++ b/backends/samsung/builders/op_softmax.py @@ -35,5 +35,5 @@ def define_node( axis = cast(int, node.args[1]) params = {"axis": axis} - + self._update_params_qdtype(node, params) enn_graph.define_op(node.name, "SOFTMAX", [input_id], [output_id], params) diff --git a/backends/samsung/builders/op_squeeze.py b/backends/samsung/builders/op_squeeze.py index d165a22fcb3..82fa17fbc95 100644 --- a/backends/samsung/builders/op_squeeze.py +++ b/backends/samsung/builders/op_squeeze.py @@ -33,4 +33,5 @@ def define_node( # output output_id = self.define_tensor(node, enn_graph, vals_to_ids) - enn_graph.define_op(node.name, "RESHAPE", [input_id], [output_id]) + params = {"new_shape": [*node.meta["val"].shape]} + enn_graph.define_op(node.name, "RESHAPE", [input_id], [output_id], params) diff --git a/backends/samsung/builders/op_to_copy.py b/backends/samsung/builders/op_to_copy.py index 545672ef6a3..c770602bb5f 100644 --- a/backends/samsung/builders/op_to_copy.py +++ b/backends/samsung/builders/op_to_copy.py @@ -11,6 +11,8 @@ NodeVisitor, register_node_visitor, ) + +from executorch.backends.samsung.builders.utils import get_map_dtype, get_tensor from executorch.backends.samsung.serialization.enn_graph_schema import EnnGraph @@ -35,5 +37,8 @@ def define_node( input_id = self.define_tensor(input, enn_graph, vals_to_ids) output_id = self.define_tensor(node, enn_graph, vals_to_ids) + params = {} + out_tensor = 
get_tensor(self.exported_program, node) + params["out_dtype"] = get_map_dtype(out_tensor.dtype) - enn_graph.define_op(node.name, "CAST", [input_id], [output_id]) + enn_graph.define_op(node.name, "CAST", [input_id], [output_id], params) diff --git a/backends/samsung/builders/op_unsqueeze.py b/backends/samsung/builders/op_unsqueeze.py index 942c3307de7..61fa06e6310 100644 --- a/backends/samsung/builders/op_unsqueeze.py +++ b/backends/samsung/builders/op_unsqueeze.py @@ -31,4 +31,5 @@ def define_node( output_id = self.define_tensor(node, enn_graph, vals_to_ids) - enn_graph.define_op(node.name, "RESHAPE", [input_id], [output_id]) + params = {"new_shape": [*node.meta["val"].shape]} + enn_graph.define_op(node.name, "RESHAPE", [input_id], [output_id], params) diff --git a/backends/samsung/builders/op_upsample_bilinear2d.py b/backends/samsung/builders/op_upsample_bilinear2d.py index a934b2789ba..d4b040460e3 100644 --- a/backends/samsung/builders/op_upsample_bilinear2d.py +++ b/backends/samsung/builders/op_upsample_bilinear2d.py @@ -46,6 +46,7 @@ def define_node( "upsampling_factor": scale_factor, "half_pixel_centers": True, } + self._update_params_qdtype(node, params) output_id = self.define_tensor(node, enn_graph, vals_to_ids) enn_graph.define_op( node.name, "RESIZE_BILINEAR", [input_id], [output_id], params diff --git a/backends/samsung/builders/utils.py b/backends/samsung/builders/utils.py index 58c84ff6d31..a640071c798 100644 --- a/backends/samsung/builders/utils.py +++ b/backends/samsung/builders/utils.py @@ -9,7 +9,6 @@ import torch from executorch.backends.samsung.utils.utils import is_graph_input, is_graph_output from executorch.backends.transforms.utils import get_param_tensor, is_param_node - from torch.export import ExportedProgram DATA_TYPE_STR_MAPPING = { diff --git a/backends/samsung/enn_preprocess.py b/backends/samsung/enn_preprocess.py index dde01bc09c7..0847ec0adeb 100644 --- a/backends/samsung/enn_preprocess.py +++ b/backends/samsung/enn_preprocess.py @@ 
-9,10 +9,16 @@ import executorch.backends.samsung.python.PyEnnWrapperAdaptor as PyEnnWrapper import torch +from executorch.backends.samsung._passes.annotate_qparams import AnnotateQparamsPass +from executorch.backends.samsung._passes.annotate_scalar_parameters import ( + AnnotateScalarParametersPass, +) from executorch.backends.samsung._passes.conv1d_to_conv2d import Conv1dToConv2d from executorch.backends.samsung._passes.customized_constant_prop import ( ConstantPropPass, ) +from executorch.backends.samsung._passes.fold_qdq import FoldQDQPass +from executorch.backends.samsung._passes.insert_qdq import InsertQDQPass from executorch.backends.samsung._passes.replace_scalar_ops import ReplaceOpsWithScalar from executorch.backends.samsung.builders.node_visitor import get_node_visitors from executorch.backends.samsung.serialization.compile_options import ( @@ -53,12 +59,16 @@ def preprocess( enn_preprocess_passes = PassManager( passes=[ + AnnotateQparamsPass(edge_program), + FoldQDQPass(), ConstantPropPass(edge_program), Conv1dToConv2d(edge_program), FuseBatchNormWithConvPass(edge_program), AddmmToLinearTransform(), ReplaceOpsWithScalar(), RemoveGetItemPass(), + InsertQDQPass(edge_program), + AnnotateScalarParametersPass(edge_program), ] ) pass_result = enn_preprocess_passes(edge_program.graph_module) diff --git a/backends/samsung/partition/enn_partitioner.py b/backends/samsung/partition/enn_partitioner.py index 952cb000429..368d069c380 100644 --- a/backends/samsung/partition/enn_partitioner.py +++ b/backends/samsung/partition/enn_partitioner.py @@ -129,5 +129,6 @@ def ops_to_not_decompose( torch.ops.aten.prelu.default, torch.ops.aten.layer_norm.default, torch.ops.aten.pixel_shuffle.default, + torch.ops.aten.hardsigmoid.default, ] return (ops_not_to_decompose, None) diff --git a/backends/samsung/quantizer/__init__.py b/backends/samsung/quantizer/__init__.py new file mode 100644 index 00000000000..621eec69240 --- /dev/null +++ b/backends/samsung/quantizer/__init__.py @@ 
-0,0 +1,10 @@ +# Copyright (c) 2025 Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from .qconfig import Precision +from .quantizer import EnnQuantizer + +__all__ = [EnnQuantizer, Precision] diff --git a/backends/samsung/quantizer/annotator.py b/backends/samsung/quantizer/annotator.py new file mode 100644 index 00000000000..31015698006 --- /dev/null +++ b/backends/samsung/quantizer/annotator.py @@ -0,0 +1,871 @@ +# Copyright (c) Qualcomm Innovation Center, Inc +# Copyright (c) 2025 Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Callable, Dict, List + +import torch +from torch._ops import OpOverload +from torch._subclasses import FakeTensor + +from torch.fx import Graph, Node + +from torchao.quantization.pt2e import FixedQParamsObserver +from torchao.quantization.pt2e.quantizer import ( + annotate_output_qspec, + QuantizationAnnotation, + QuantizationSpec, + SharedQuantizationSpec, +) + +from .qconfig import QuantizationConfig + +OP_ANNOTATOR: Dict[OpOverload, Callable] = {} + +ADD_OPS = [ + torch.ops.aten.add, + torch.ops.aten.add.Tensor, + torch.ops.aten.add_.Tensor, +] + + +def register_annotator(ops: List[OpOverload]): + def decorator(annotator: Callable): + for op in ops: + OP_ANNOTATOR[op] = annotator + + return decorator + + +def annotate(graph: Graph, quant_config: QuantizationConfig) -> None: + # Pattern annotation + _annotate_fused_activation_pattern(graph, quant_config) + + # Per-op annotation + for node in graph.nodes: + if node.op == "placeholder": + annotate_placeholder(node, quant_config) + elif node.op == "call_function": + annotate_func = OP_ANNOTATOR.get(node.target, None) + if annotate_func is not None: + annotate_func(node, quant_config) + + +def 
_is_annotated(nodes: List[Node]): + """ + Given a list of nodes (that represents an operator pattern), + return True if any of the node + is annotated, otherwise return False + """ + annotated = False + for node in nodes: + annotated = annotated or ( + "quantization_annotation" in node.meta + and node.meta["quantization_annotation"]._annotated + ) + return annotated + + +def _is_fake_tensor(node: Node): + if ( + isinstance(node, Node) + and "val" in node.meta + and isinstance(node.meta["val"], FakeTensor) + ): + return True + return False + + +def _is_float_tensor(node: Node): + """Check if the node's tensor is a float tensor, + so that we can skip quantization for the node + since observers only works with float Tensors + """ + if not _is_fake_tensor(node): + return False + return node.meta["val"].dtype in [torch.float32, torch.float16] + + +def _mark_nodes_as_annotated(nodes: List[Node]): + for node in nodes: + if "quantization_annotation" not in node.meta: + node.meta["quantization_annotation"] = QuantizationAnnotation() + node.meta["quantization_annotation"]._annotated = True + + +# for nodes whose targets ars placehold (not call_function) +def annotate_placeholder(node: Node, quant_config: QuantizationConfig) -> None: + if _is_annotated([node]): + return + + if _is_float_tensor(node): + annotate_output_qspec(node, quant_config.output_activation) + + _mark_nodes_as_annotated([node]) + + +# CASE 1: fused_activation case (ex. 
Conv2D + ReLU) +def _is_hardtanh_for_relux(relu_node: torch.fx.node.Node): + if relu_node.target in [ + torch.ops.aten.hardtanh.default, + torch.ops.aten.hardtanh_.default, + ]: + # checking if hardtanh is convertable to ReLU6 + # ReLU1 is not supported now + if not relu_node.args[1] == 0.0: + return False + if relu_node.args[2] == 6.0: # for ReLU6 + return True + return True + + +def _annotate_fused_activation_pattern( + graph: Graph, quant_config: QuantizationConfig +) -> None: + for relu_node in graph.nodes: + # Check relu/relu6 node + if relu_node.op != "call_function": + continue + if relu_node.target not in [ + # The strategy of ReLU and ReLU6 is fold_activation in ENNQuant + torch.ops.aten.relu.default, + torch.ops.aten.relu_.default, + torch.ops.aten.relu6.default, + torch.ops.aten.relu6_.default, + torch.ops.aten.hardtanh.default, + torch.ops.aten.hardtanh_.default, + ]: + continue + + if not _is_hardtanh_for_relux(relu_node): + continue + + producer_node = relu_node.args[0] + if not isinstance(producer_node, Node): + continue + if producer_node.op != "call_function": + continue + if len(producer_node.users) != 1: + continue + + # Handle affine + relu fusion + if producer_node.target in [ + torch.ops.aten.conv1d.default, + torch.ops.aten.conv2d.default, + torch.ops.aten.linear.default, + ]: + # input & weight (or bias) setting for Conv node(producer_node) + quantization_annotation = producer_node.meta.get( + "quantization_annotation", QuantizationAnnotation() + ) + if quantization_annotation.input_qspec_map is None: + quantization_annotation.input_qspec_map = {} + + input = producer_node.args[0] + quantization_annotation.input_qspec_map[input] = ( + quant_config.input_activation + ) + + quantization_annotation.input_qspec_map[producer_node.args[1]] = ( + quant_config.weight + ) + if len(producer_node.args) > 2 and quant_config.bias is not None: + quantization_annotation.input_qspec_map[producer_node.args[2]] = ( + quant_config.bias + ) + + 
producer_node.meta["quantization_annotation"] = quantization_annotation + producer_node.meta["quantization_annotation"]._annotated = True + # out setting for activation node (relu_node) + quantization_annotation = relu_node.meta.get( + "quantization_annotation", QuantizationAnnotation() + ) + quantization_annotation.output_qspec = quant_config.output_activation + + relu_node.meta["quantization_annotation"] = quantization_annotation + relu_node.meta["quantization_annotation"]._annotated = True + continue + + +# CASE 2-1: two input case without Shared Quant +@register_annotator( + [ + torch.ops.aten.div, + torch.ops.aten.div.Tensor, + torch.ops.aten.divide.Tensor, + torch.ops.aten.matmul.default, + torch.ops.aten.bmm.default, + torch.ops.aten.sum.dim_IntList, + ] +) +def annotate_2in1out(node: Node, quant_config: QuantizationConfig) -> None: + input_act0 = node.args[0] + input_act1 = node.args[1] + # skipping quantization if 1st input is not float. + if _is_annotated([node]) or not _is_float_tensor(input_act0): + return + + input_act_qspec = quant_config.input_activation + output_act_qspec = ( + quant_config.output_activation if _is_float_tensor(node) else None + ) + + input_qspec_map = {} + if _is_float_tensor(input_act0): + input_qspec_map[input_act0] = input_act_qspec + + if _is_float_tensor(input_act1): + input_qspec_map[input_act1] = input_act_qspec + + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=output_act_qspec, + _annotated=True, + ) + + +# getting QuantAnnot though the first input +def _get_quantization_annotation(node: Node): + if node.op == "placeholder": + return False + elif "quantization_annotation" in node.meta: + return node + elif node.args == (): + return False + elif isinstance(node.args[0], Node): + return _get_quantization_annotation(node.args[0]) + elif isinstance(node.args[0], list): + # for cat, concatenate and stack + if isinstance(node.args[0][0], Node): + return 
_get_quantization_annotation(node.args[0][0]) + else: + return False + else: + return False + + +# CASE 2-2: two input case with Shared Quant +# ops.add / ops.add_ are processed by another annotator +@register_annotator( + [ + torch.ops.aten.sub, + torch.ops.aten.mul, + torch.ops.aten.sub.Tensor, + torch.ops.aten.mul.Tensor, + torch.ops.aten.sub_.Tensor, + torch.ops.aten.mul_.Tensor, + torch.ops.aten.rsub.Scalar, + torch.ops.aten.mul.Scalar, + ] +) +def annotate_2in1out_with_SharedQuant( + node: Node, quant_config: QuantizationConfig +) -> None: + + input_qspec_map = {} + input0 = node.args[0] + input1 = node.args[1] + + # skipping quantization if 1st input is not float. + if _is_annotated([node]) or not _is_float_tensor(input0): + return + if ( + isinstance(input0, Node) + and isinstance(input1, float) + and not _get_quantization_annotation(input0) + ): + return + if ( + isinstance(input0, float) + and isinstance(input1, Node) + and not _get_quantization_annotation(input1) + ): + return + if isinstance(input0, Node) and isinstance(input1, Node): + shared_qspec = SharedQuantizationSpec((input0, node)) + input_qspec_map[input0] = quant_config.input_activation + input_qspec_map[input1] = shared_qspec + + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=shared_qspec, + _annotated=True, + ) + + else: + input_act_qspec = quant_config.input_activation + output_act_qspec = ( + quant_config.output_activation if _is_float_tensor(node) else None + ) + + input_qspec_map = {} + input_act0 = node.args[0] + if _is_float_tensor(input_act0): + input_qspec_map[input_act0] = input_act_qspec + + input_act1 = node.args[1] + if _is_float_tensor(input_act1): + input_qspec_map[input_act1] = input_act_qspec + + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=output_act_qspec, + _annotated=True, + ) + + +# CASE 2-3: only for add ops +@register_annotator(ADD_OPS) 
+def annotate_add_ops_with_SharedQuant( + node: Node, quant_config: QuantizationConfig +) -> None: + + input_qspec_map = {} + input0 = node.args[0] + input1 = node.args[1] + + # skipping quantization if 1st input is not float. + if _is_annotated([node]) or not _is_float_tensor(input0): + return + + if isinstance(input0, Node) and isinstance(input1, Node): + NonQuantShare_ops_for_add = [torch.ops.aten.dropout.default] + ADD_OPS + if ( + input0.op == "call_function" and input0.target in NonQuantShare_ops_for_add + ) or ( + input1.op == "call_function" and input1.target in NonQuantShare_ops_for_add + ): + input_act_qspec = quant_config.input_activation + output_act_qspec = ( + quant_config.output_activation if _is_float_tensor(node) else None + ) + + input_qspec_map = {} + input_act0 = node.args[0] + if _is_float_tensor(input_act0): + input_qspec_map[input_act0] = input_act_qspec + + input_act1 = node.args[1] + if _is_float_tensor(input_act1): + input_qspec_map[input_act1] = input_act_qspec + + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=output_act_qspec, + _annotated=True, + ) + else: + shared_qspec = SharedQuantizationSpec((input0, node)) + input_qspec_map[input0] = quant_config.input_activation + input_qspec_map[input1] = shared_qspec + + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=shared_qspec, + _annotated=True, + ) + elif ( + isinstance(input0, Node) + and isinstance(input1, float) + and not _get_quantization_annotation(input0) + ): + pass + elif ( + isinstance(input0, float) + and isinstance(input1, Node) + and not _get_quantization_annotation(input1) + ): + pass + else: + input_act_qspec = quant_config.input_activation + output_act_qspec = ( + quant_config.output_activation if _is_float_tensor(node) else None + ) + + input_qspec_map = {} + input_act0 = node.args[0] + if _is_float_tensor(input_act0): + 
input_qspec_map[input_act0] = input_act_qspec + + input_act1 = node.args[1] + if _is_float_tensor(input_act1): + input_qspec_map[input_act1] = input_act_qspec + + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=output_act_qspec, + _annotated=True, + ) + + +# CASE 3-1: Single input + Single Out case without Shared Quant +@register_annotator( + [ + torch.ops.aten.ceil.default, + torch.ops.aten.clamp.default, + torch.ops.aten.relu.default, + torch.ops.aten.relu_.default, + torch.ops.aten.relu6.default, + torch.ops.aten.relu6_.default, + torch.ops.aten.cos.default, + torch.ops.aten.sin.default, + torch.ops.aten.tanh.default, + torch.ops.aten.hardswish.default, + torch.ops.aten.hardswish_.default, + torch.ops.aten.hardsigmoid.default, + torch.ops.aten.hardsigmoid_.default, + torch.ops.aten.hardtanh.default, + torch.ops.aten.hardtanh_.default, + torch.ops.aten.mean.default, + torch.ops.aten.adaptive_avg_pool2d.default, + torch.ops.aten.avg_pool2d.default, + torch.ops.aten.leaky_relu.default, + torch.ops.aten.leaky_relu_.default, + torch.ops.aten.prelu.default, + torch.ops.aten.upsample_bilinear2d.vec, + torch.ops.aten.upsample_nearest2d.vec, + torch.ops.aten.mean.dim, + torch.ops.aten.sqrt.default, + torch.ops.aten.gelu.default, + torch.ops.aten.scaled_dot_product_attention.default, + torch.ops.aten.rsqrt.default, + torch.ops.aten.pow.Tensor_Scalar, + torch.ops.aten.topk.default, + ] +) +def annotate_1in1out(node: Node, quant_config: QuantizationConfig) -> None: + # skipping quantization if input is not float. + if _is_annotated([node]) or not _is_float_tensor(node.args[0]): + return + + quantization_annotation = node.meta.get( + "quantization_annotation", QuantizationAnnotation() + ) + if quantization_annotation.input_qspec_map is None: + quantization_annotation.input_qspec_map = {} + + # one inputs + one output case. 
+ input_act_qspec = quant_config.input_activation + quantization_annotation.input_qspec_map[node.args[0]] = input_act_qspec + quantization_annotation.output_qspec = quant_config.output_activation + + node.meta["quantization_annotation"] = quantization_annotation + node.meta["quantization_annotation"]._annotated = True + + +# CASE 3-2: Single input + Single Out case with Shared Quant +@register_annotator( + [ + torch.ops.aten.permute.default, + torch.ops.aten.view.default, + torch.ops.aten._unsafe_view.default, + torch.ops.aten.squeeze.default, + torch.ops.aten.squeeze.dim, + torch.ops.aten.squeeze_copy.dims, + torch.ops.aten.unsqueeze.default, + torch.ops.aten.unsqueeze_copy.default, + torch.ops.aten.transpose.int, + torch.ops.aten.expand.default, + torch.ops.aten.max_pool2d.default, + torch.ops.aten.max_pool2d_with_indices.default, + torch.ops.aten.reshape.default, + torch.ops.aten.select.int, + torch.ops.aten.flatten.using_ints, + torch.ops.aten.pad.default, + torch.ops.aten.slice.Tensor, + torch.ops.aten.to.dtype, + ] +) +def annotate_1in1out_with_SharedQuant( + node: Node, quant_config: QuantizationConfig +) -> None: + input_qspec_map = {} + input = node.args[0] + assert isinstance(input, Node) + if _is_annotated([node]) or not _is_float_tensor(input): + return + + shared_qspec = SharedQuantizationSpec((input, node)) + + # get QuantAnnot from the input path + shared_quant_node = _get_quantization_annotation(input) + if shared_quant_node: + input_qspec_map[shared_quant_node] = SharedQuantizationSpec(shared_quant_node) + shared_qspec = SharedQuantizationSpec((shared_quant_node, node)) + else: + # if no QuantAnnot in the input path + input_qspec_map[input] = quant_config.input_activation + shared_qspec = SharedQuantizationSpec((input, node)) + + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=shared_qspec, + _annotated=True, + ) + + +# CASE 3-3: Single input + Single Out case with FP 
+@register_annotator( + [ + torch.ops.aten.softmax.int, + torch.ops.aten._softmax.default, + torch.ops.aten._safe_softmax.default, + torch.ops.aten.log_softmax.int, + ] +) +def annotate_1in1out_with_SharedQuant_for_FP( + node: Node, quant_config: QuantizationConfig +) -> None: + input_qspec_map = {} + input = node.args[0] + assert isinstance(input, Node) + + if _is_annotated([node]) or not _is_float_tensor(input): + return + + if input.target in ADD_OPS and _is_annotated([input]): + del input.meta["quantization_annotation"] + + # get QuantAnnot from the input path + shared_quant_node = _get_quantization_annotation(input) + if shared_quant_node: + # if QuantAnnot in the input path, input_qspec is shared, but output_qspec is not. + input_qspec_map[shared_quant_node] = SharedQuantizationSpec(shared_quant_node) + + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=quant_config.output_activation, + _annotated=True, + ) + else: + # if no QuantAnnot in the input path + node.meta["quantization_annotation"] = QuantizationAnnotation( + output_qspec=quant_config.output_activation, + _annotated=True, + ) + + +# CASE 4: One value input + one index input with Shared Quant +@register_annotator([torch.ops.aten.index.Tensor]) +def annotate_index(node: Node, quant_config: QuantizationConfig) -> None: + input_qspec_map = {} + input = node.args[0] + assert isinstance(input, Node) + + if _is_annotated([node]) or not _is_float_tensor(input): + return + + # get QuantAnnt from the input path + shared_quant_node = _get_quantization_annotation(input) + if shared_quant_node: + shared_qspec = SharedQuantizationSpec((shared_quant_node, node)) + input_qspec_map[input] = quant_config.input_activation + + # sharing QuantAnnot with the parent + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=shared_qspec, + _annotated=True, + ) + + +# CASE 5 input + index + value & output 
with Shared Quant +@register_annotator( + [torch.ops.aten.index_put.default, torch.ops.aten.index_put_.default] +) +def annotate_index_put(node: Node, quant_config: QuantizationConfig) -> None: + input_qspec_map = {} + input = node.args[0] # from KVCache in LLAMA + value = node.args[2] # from linear projection layer + assert isinstance(input, Node) + assert isinstance(value, Node) + + if _is_annotated([node]) or not _is_float_tensor(input): + return + + # get QuantAnnot from input path + shared_quant_node = _get_quantization_annotation(input) + if shared_quant_node: + shared_qspec = SharedQuantizationSpec((shared_quant_node, node)) + input_qspec_map[input] = shared_qspec + input_qspec_map[value] = shared_qspec + output_qspec = shared_qspec + else: + # if no QuantAnnot in input path, asign the default QuantAnnot from quant_config. + input_qspec_map[input] = quant_config.input_activation + input_qspec_map[value] = SharedQuantizationSpec((input, node)) + output_qspec = SharedQuantizationSpec((input, node)) + + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=output_qspec, + _annotated=True, + ) + + +# CASE 6 unbind + getitem case +# (inputQuant--unbinde--no Qunat) --> (no Qunat--getitem--outputQuant) +@register_annotator([torch.ops.aten.unbind.int]) +def annotate_unbind(node: Node, quant_config: QuantizationConfig) -> None: + input_qspec_map = {} + input = node.args[0] + assert isinstance(input, Node) + + if _is_annotated([node]) or not _is_float_tensor(input): + return + + # get QuantAnnot from input path + shared_quant_node = _get_quantization_annotation(input) + if shared_quant_node: + input_qspec_map[input] = quant_config.input_activation + shared_qspec = SharedQuantizationSpec((shared_quant_node, node)) + else: + # if no QuantAnnot in input path, asign the default QuantAnnot from quant_config. 
+ input_qspec_map[input] = quant_config.input_activation + shared_qspec = SharedQuantizationSpec((input, node)) + + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=shared_qspec, + _annotated=True, + ) + + for users_node in node.users: + users_node.meta["quantization_annotation"] = QuantizationAnnotation( + output_qspec=shared_qspec, + _annotated=True, + ) + + +# CASE 7: stand-alone Conv2d and Conv1d +@register_annotator( + [ + torch.ops.aten.conv2d.default, + torch.ops.aten.conv1d.default, + torch.ops.aten.linear.default, + ] +) +def annotate_conv2d(node: Node, quant_config: QuantizationConfig) -> None: + # skipping quantization if weights are not float + if _is_annotated([node]) or not _is_float_tensor(node.args[1]): + return + + input = node.args[0] + # input & weight (or bias) setting for Conv node(producer_node) + quantization_annotation = node.meta.get( + "quantization_annotation", QuantizationAnnotation() + ) + if quantization_annotation.input_qspec_map is None: + quantization_annotation.input_qspec_map = {} + + shared_quant_node = _get_quantization_annotation(input) + if shared_quant_node: + quantization_annotation.input_qspec_map[input] = SharedQuantizationSpec( + shared_quant_node + ) + else: + quantization_annotation.input_qspec_map[input] = quant_config.input_activation + quantization_annotation.input_qspec_map[node.args[1]] = quant_config.weight + if len(node.args) > 2 and quant_config.bias is not None: + quantization_annotation.input_qspec_map[node.args[2]] = quant_config.bias + quantization_annotation.output_qspec = quant_config.output_activation + + node.meta["quantization_annotation"] = quantization_annotation + node.meta["quantization_annotation"]._annotated = True + + +# CASE 8: embedding +@register_annotator([torch.ops.aten.embedding.default]) +def annotate_embedding(node: Node, quant_config: QuantizationConfig) -> None: + input_qspec_map = {} + weight = node.args[0] + if 
_is_annotated([node]) or not _is_float_tensor(weight): + return + + input_qspec_map[weight] = quant_config.input_activation + + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=quant_config.output_activation, + _annotated=True, + ) + + +# CASE 9: Concat & Stack +@register_annotator( + [ + torch.ops.aten.cat.default, + torch.ops.aten.concat.default, + torch.ops.aten.stack.default, + ] +) +def annotate_cat(node: Node, quant_config: QuantizationConfig) -> None: + inputs = node.args[0] + first_input = inputs[0] + assert isinstance(inputs, list) + assert isinstance(first_input, Node) + + if _is_annotated([node]) or not _is_float_tensor(first_input): + return + + input_qspec_map = {} + shared_qspec = SharedQuantizationSpec((first_input, node)) + for input in inputs: + if input == first_input: + input_qspec_map[input] = quant_config.input_activation + else: + input_qspec_map[input] = shared_qspec + + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=shared_qspec, + _annotated=True, + ) + + +# CASE 10: various normalizations +@register_annotator([torch.ops.aten.rms_norm.default]) +def annotate_rms_norm(node: Node, quant_config: QuantizationConfig) -> None: + if _is_annotated([node]): + return + + quantization_annotation = node.meta.get( + "quantization_annotation", QuantizationAnnotation() + ) + if quantization_annotation.input_qspec_map is None: + quantization_annotation.input_qspec_map = {} + + quantization_annotation.input_qspec_map[node.args[0]] = ( + quant_config.input_activation + ) # active + quantization_annotation.input_qspec_map[node.args[2]] = ( + quant_config.input_activation + ) # weight + quantization_annotation.output_qspec = quant_config.output_activation + node.meta["quantization_annotation"] = quantization_annotation + node.meta["quantization_annotation"]._annotated = True + + 
+@register_annotator([torch.ops.aten.group_norm.default]) +def annotate_group_norm(node: Node, quant_config: QuantizationConfig) -> None: + if _is_annotated([node]): + return + + quantization_annotation = node.meta.get( + "quantization_annotation", QuantizationAnnotation() + ) + if quantization_annotation.input_qspec_map is None: + quantization_annotation.input_qspec_map = {} + + quantization_annotation.input_qspec_map[node.args[0]] = ( + quant_config.input_activation + ) # active + quantization_annotation.input_qspec_map[node.args[2]] = ( + quant_config.weight + ) # weight + quantization_annotation.output_qspec = quant_config.output_activation + + node.meta["quantization_annotation"] = quantization_annotation + node.meta["quantization_annotation"]._annotated = True + + +@register_annotator([torch.ops.aten.layer_norm.default]) +def annotate_layer_norm(node: Node, quant_config: QuantizationConfig) -> None: + if _is_annotated([node]): + return + + quantization_annotation = node.meta.get( + "quantization_annotation", QuantizationAnnotation() + ) + if quantization_annotation.input_qspec_map is None: + quantization_annotation.input_qspec_map = {} + + quantization_annotation.input_qspec_map[node.args[0]] = ( + quant_config.input_activation + ) # active + quantization_annotation.input_qspec_map[node.args[2]] = ( + quant_config.input_activation + ) # weight + quantization_annotation.output_qspec = quant_config.output_activation + + node.meta["quantization_annotation"] = quantization_annotation + node.meta["quantization_annotation"]._annotated = True + + +@register_annotator([torch.ops.aten._native_batch_norm_legit_no_training.default]) +def annotate_batch_norm(node: Node, quant_config: QuantizationConfig) -> None: + if _is_annotated([node]): + return + + quantization_annotation = node.meta.get( + "quantization_annotation", QuantizationAnnotation() + ) + if quantization_annotation.input_qspec_map is None: + quantization_annotation.input_qspec_map = {} + + 
quantization_annotation.input_qspec_map[node.args[0]] = ( + quant_config.input_activation + ) # active + + quantization_annotation.input_qspec_map[node.args[1]] = ( + quant_config.input_activation + ) # weight + quantization_annotation.output_qspec = quant_config.output_activation + + node.meta["quantization_annotation"] = quantization_annotation + node.meta["quantization_annotation"]._annotated = True + + +# CASE 11: Sigmoid +@register_annotator([torch.ops.aten.sigmoid, torch.ops.aten.sigmoid.default]) +def annotate_sigmoid(node: Node, quant_config: QuantizationConfig) -> None: + if _is_annotated([node]): + return + + input_qspec_map = {} + input_act = node.args[0] + input_qspec_map[input_act] = quant_config.input_activation + + assert isinstance(input_act, Node) + out_qconf = quant_config.output_activation + + q_max = ( + torch.iinfo(out_qconf.dtype).max + if out_qconf.quant_max is None + else out_qconf.quant_max + ) + q_min = ( + torch.iinfo(out_qconf.dtype).min + if out_qconf.quant_min is None + else out_qconf.quant_min + ) + + scale = 1 / (q_max - q_min + 1) + + bias_obs_ctr = FixedQParamsObserver.with_args( + scale=scale, + zero_point=0, + dtype=quant_config.output_activation.dtype, + qscheme=torch.torch.per_tensor_affine, + quant_max=q_max, + quant_min=q_min, + ) + + # make sigmoid map to the range between 0~1 + out_act_quantization_spec = QuantizationSpec( + dtype=quant_config.output_activation.dtype, + quant_max=q_max, + quant_min=q_min, + observer_or_fake_quant_ctr=bias_obs_ctr, + qscheme=torch.torch.per_tensor_affine, + ) + + if _is_float_tensor(node): + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=out_act_quantization_spec, + _annotated=True, + ) diff --git a/backends/samsung/quantizer/qconfig.py b/backends/samsung/quantizer/qconfig.py new file mode 100644 index 00000000000..f32c8d39796 --- /dev/null +++ b/backends/samsung/quantizer/qconfig.py @@ -0,0 +1,174 @@ +# Copyright (c) 2025 
Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from dataclasses import dataclass +from enum import IntEnum, unique +from typing import Callable, Optional + +import torch +from torchao.quantization.pt2e import ( + FakeQuantize, + MinMaxObserver, + PerChannelMinMaxObserver, +) +from torchao.quantization.pt2e.quantizer import QuantizationSpec + + +@unique +class Precision(IntEnum): + A8W8 = 3 + + +@dataclass(eq=True, frozen=True) +class QuantizationConfig: + input_activation: Optional[QuantizationSpec] + output_activation: Optional[QuantizationSpec] + weight: Optional[QuantizationSpec] + bias: Optional[QuantizationSpec | Callable] + + +def get_quant_config( + precision: Precision, + is_per_channel: bool = False, + is_qat: bool = False, +) -> QuantizationConfig: + + precision_mappings = { + Precision.A8W8: get_a8w8_enn_quant_config, + } + if precision not in precision_mappings: + raise RuntimeError("Unrecognized precision setting.") + + is_weight_symm = is_per_channel + + qconfig_fn = precision_mappings[precision] + return qconfig_fn(is_per_channel, is_qat, wei_symmetric=is_weight_symm) + + +def _get_activation_qspec( + dtype, + is_symmetric, + is_qat, + observer_cls=MinMaxObserver, + quant_min=None, + quant_max=None, +): + eps_value = 2**-12 + if quant_max is None: + quant_max = torch.iinfo(dtype).max + if quant_min is None: + quant_min = torch.iinfo(dtype).min + + qscheme = torch.per_tensor_symmetric if is_symmetric else torch.per_tensor_affine + if is_qat: + observer_or_fake_quant = FakeQuantize.with_args( + observer=observer_cls, eps=eps_value + ) + else: + observer_or_fake_quant = observer_cls.with_args(eps=eps_value) + + return QuantizationSpec( + dtype=dtype, + quant_min=quant_min, + quant_max=quant_max, + qscheme=qscheme, + observer_or_fake_quant_ctr=observer_or_fake_quant, + ) + + +def _get_weight_qspec( + dtype, 
is_symmetric, is_per_channel, is_qat, quant_min=None, quant_max=None +): + assert is_symmetric or not is_per_channel, "Not support asymm+perchannel mode" + + eps_value = 2**-12 + + if quant_max is None: + quant_max = torch.iinfo(dtype).max + if quant_min is None: + quant_min = torch.iinfo(dtype).min + + if not is_per_channel: + qscheme = ( + torch.per_tensor_symmetric if is_symmetric else torch.per_tensor_affine + ) + observer_cls = MinMaxObserver + else: + qscheme = ( + torch.per_channel_symmetric if is_symmetric else torch.per_channel_affine + ) + observer_cls = PerChannelMinMaxObserver + + if is_qat: + observer_or_fake_quant = FakeQuantize.with_args( + observer=observer_cls, eps=eps_value + ) + else: + observer_or_fake_quant = observer_cls.with_args(eps=eps_value) + + return QuantizationSpec( + dtype=dtype, + quant_min=quant_min, + quant_max=quant_max, + qscheme=qscheme, + ch_axis=0, + observer_or_fake_quant_ctr=observer_or_fake_quant, + ) + + +def get_a8w8_enn_quant_config( + is_per_channel=True, is_qat=False, act_symmetric=False, wei_symmetric=False +) -> QuantizationConfig: + act_quantization_spec = _get_activation_qspec(torch.int8, act_symmetric, is_qat) + wgt_quantization_spec = _get_weight_qspec( + torch.int8, wei_symmetric, is_per_channel, is_qat + ) + bias_quantization_spec = None + quantization_config = QuantizationConfig( + input_activation=act_quantization_spec, + output_activation=act_quantization_spec, + weight=wgt_quantization_spec, + bias=bias_quantization_spec, + ) + return quantization_config + + +class QuantInfo: + def __init__(self, torch_dtype: torch.dtype, string: str): + self._torch_dtype = torch_dtype + self._string = string + + @property + def torch_dtype(self): + return self._torch_dtype + + @property + def string(self): + return self._string + + +class QuantInfoManager: + QUANT_INFO_MAP = { + Precision.A8W8: (QuantInfo(torch.int8, "INT8"), QuantInfo(torch.int8, "INT8")), + } + FP_INFO = ( + QuantInfo(torch.float32, "FLOAT32"), + 
QuantInfo(torch.float32, "FLOAT32"), + ) + + def __init__(self): + self.precision = None + + def set_precision(self, precision: Precision): + self.precision = precision + + @property + def weight_precison(self) -> Optional[QuantInfo]: + return self.QUANT_INFO_MAP.get(self.precision, self.FP_INFO)[0] + + @property + def act_precision(self) -> Optional[QuantInfo]: + return self.QUANT_INFO_MAP.get(self.precision, self.FP_INFO)[1] diff --git a/backends/samsung/quantizer/quantizer.py b/backends/samsung/quantizer/quantizer.py new file mode 100644 index 00000000000..cf46677d000 --- /dev/null +++ b/backends/samsung/quantizer/quantizer.py @@ -0,0 +1,65 @@ +# Copyright (c) 2025 Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Callable, Sequence + +import torch +from torch.fx import GraphModule +from torchao.quantization.pt2e.quantizer import Quantizer + +from .annotator import annotate +from .qconfig import get_quant_config, Precision, QuantInfoManager + + +global_quant_info = QuantInfoManager() + + +class EnnQuantizer(Quantizer): + + def __init__(self): + super().__init__() + + self._precision = Precision.A8W8 + global_quant_info.set_precision(self._precision) + self._is_per_channel = True + self._is_qat = False + self.custom_quant_annotations: Sequence[Callable] = [] + + def setup_precision(self, quant_dtype: Precision) -> None: + assert quant_dtype in Precision, f"No support for Precision {quant_dtype}." + self._precision = quant_dtype + global_quant_info.set_precision(self._precision) + + def setup_quant_params( + self, quant_dtype: Precision, is_per_channel=True, is_qat=False + ) -> None: + assert quant_dtype in Precision, f"No support for Precision {quant_dtype}." 
+ self._precision = quant_dtype + self._is_per_channel = is_per_channel + self._is_qat = is_qat + + def annotate(self, model: GraphModule) -> GraphModule: + self._annotate(model) + self._annotate_custom_annotation(model) + return model + + def _annotate(self, gm: GraphModule) -> None: + quant_config = get_quant_config( + self._precision, self._is_per_channel, self._is_qat + ) + annotate(gm.graph, quant_config) + + def add_custom_quant_annotations( + self, custom_quant_annotations: Sequence[Callable] + ) -> None: + self.custom_quant_annotations = custom_quant_annotations + + def _annotate_custom_annotation(self, gm: GraphModule) -> None: + for annotation_func in self.custom_quant_annotations: + annotation_func(gm) + + def validate(self, model: torch.fx.GraphModule) -> None: + return diff --git a/backends/samsung/serialization/compile_options.py b/backends/samsung/serialization/compile_options.py index 1ad2350cfeb..a4af40368e9 100644 --- a/backends/samsung/serialization/compile_options.py +++ b/backends/samsung/serialization/compile_options.py @@ -11,7 +11,8 @@ from dataclasses import dataclass from enum import IntEnum, unique -import pkg_resources +from importlib.resources import files + from executorch.exir._serialize._dataclass import _DataclassEncoder from executorch.exir._serialize._flatbuffer import _flatc_compile from executorch.exir.backend.backend_details import CompileSpec @@ -36,12 +37,15 @@ def gen_samsung_backend_compile_spec_core(options: EnnExecuTorchOptions) -> Comp with tempfile.TemporaryDirectory() as d: # schema schema_path = os.path.join(d, "{}.fbs".format(COMPILE_OPTION_SCHEMA_NAME)) + + schema_content = ( + files(__package__) + .joinpath(f"{COMPILE_OPTION_SCHEMA_NAME}.fbs") + .read_bytes() + ) + with open(schema_path, "wb") as schema_file: - schema_file.write( - pkg_resources.resource_string( - __name__, "{}.fbs".format(COMPILE_OPTION_SCHEMA_NAME) - ) - ) + schema_file.write(schema_content) # dump json json_path = os.path.join(d, 
"{}.json".format(COMPILE_OPTION_SCHEMA_NAME)) enn_options_json = json.dumps(options, cls=_DataclassEncoder, indent=4) diff --git a/backends/samsung/serialization/enn_graph_schema.py b/backends/samsung/serialization/enn_graph_schema.py index 7e74182f9d7..5209a8672ee 100644 --- a/backends/samsung/serialization/enn_graph_schema.py +++ b/backends/samsung/serialization/enn_graph_schema.py @@ -5,13 +5,16 @@ # LICENSE file in the root directory of this source tree. import logging -from typing import Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union import executorch.backends.samsung.python.PyGraphWrapperAdaptor as PyGraphWrapper import numpy as np import torch +from executorch.backends.samsung.builders.utils import DATA_TYPE_STR_MAPPING +from executorch.backends.samsung.utils.constants import QuantConstants +from executorch.backends.samsung.utils.utils import quantize_tensor class EnnGraph: @@ -24,6 +27,10 @@ def __init__(self): self.inputs = [] self.outputs = [] + def init(self, name: str, soc_name): + self.name = name + self.soc_name = soc_name + def define_op( self, name, @@ -46,22 +53,54 @@ def define_op( py_param_wrapper.SetScalarValue(params[key]) else: logging.error("Unsupported param type.") + # Set op.AddOpParam(py_param_wrapper) self.graph.DefineOpNode(op) - def define_tensor( + def define_tensor( # noqa: C901 self, name: str, shape: List, data_type: str, tensor_type: str, data: Optional[Union[np.ndarray, torch.Tensor]] = None, + quant_param: Optional[Dict[str, Any]] = None, ) -> int: layout = "NCHW" if len(shape) == 4 else "UNDEFINED" + if quant_param is not None: + data_type = DATA_TYPE_STR_MAPPING[ + quant_param[QuantConstants.QUANT_KEY.quant_dtype] + ] + tensor = PyGraphWrapper.PyEnnTensorWrapper(name, shape, data_type, layout) + if quant_param is not None: + need_quantize = True + + scales = self._affine_meta_param( + quant_param[QuantConstants.QUANT_KEY.scale] + ) + zero_points = self._affine_meta_param( + 
quant_param[QuantConstants.QUANT_KEY.zero_point] + ) + q_dtype = self._affine_meta_param( + quant_param[QuantConstants.QUANT_KEY.quant_dtype] + ) + tensor.AddQuantizeParam(q_dtype, scales, zero_points) + + if need_quantize and data is not None: + if isinstance(data, np.ndarray): + data = torch.tensor(data) + data = quantize_tensor( + data, + scales, + zero_points, + quant_param[QuantConstants.QUANT_KEY.quant_dtype], + axis=quant_param.get("axis"), + ) + if data is not None: if isinstance(data, torch.Tensor): data = data.detach().numpy() @@ -83,3 +122,20 @@ def finish(self): def serialize(self): return self.graph.Serialize() + + @staticmethod + def _affine_meta_param(param: Any) -> str: + type_str_affine_table = { + torch.int8: "AINT8", + } + if isinstance(param, str): + return param + if isinstance(param, (float, int)): + return [param] + if hasattr(param, "tolist"): + return param.tolist() + if isinstance(param, torch.dtype): + # Convenient for debugging + param = type_str_affine_table.get(param, "") + + return param diff --git a/backends/samsung/utils/constants.py b/backends/samsung/utils/constants.py new file mode 100644 index 00000000000..7c3997b9fe2 --- /dev/null +++ b/backends/samsung/utils/constants.py @@ -0,0 +1,45 @@ +# Copyright (c) 2025 Samsung Electronics Co. LTD +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from executorch.exir.dialects._ops import ops as exir_ops + + +class QuantConstants: + # TODO: check keys + class QUANT_KEY: + scale = "scales" + zero_point = "zero_points" + quant_min = "quant_min" + quant_max = "quant_max" + quant_dtype = "quant_dtype" + + PERCHANNEL_KEY_MAP = { + "scales": QUANT_KEY.scale, + "zero_points": QUANT_KEY.zero_point, + "quant_min": QUANT_KEY.quant_min, + "quant_max": QUANT_KEY.quant_max, + "dtype": QUANT_KEY.quant_dtype, + } + # SNC ir always use key 'scales' and 'zero_points' + PERTENSOR_KEY_MAP = { + "scale": QUANT_KEY.scale, + "zero_point": QUANT_KEY.zero_point, + "quant_min": QUANT_KEY.quant_min, + "quant_max": QUANT_KEY.quant_max, + "dtype": QUANT_KEY.quant_dtype, + } + + QUANT_OPS_KEY_MAP = { + exir_ops.edge.quantized_decomposed.quantize_per_channel.default: PERCHANNEL_KEY_MAP, + exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: PERTENSOR_KEY_MAP, + exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor: PERTENSOR_KEY_MAP, + } + + DEQUANT_OPS_KEY_MAP = { + exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: PERTENSOR_KEY_MAP, + exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor: PERTENSOR_KEY_MAP, + exir_ops.edge.quantized_decomposed.dequantize_per_channel.default: PERCHANNEL_KEY_MAP, + } diff --git a/backends/samsung/utils/export_utils.py b/backends/samsung/utils/export_utils.py index aaf407ef0b3..39992f2ea2a 100644 --- a/backends/samsung/utils/export_utils.py +++ b/backends/samsung/utils/export_utils.py @@ -4,20 +4,30 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-from typing import Optional, Tuple +import logging +from typing import List, Optional, Tuple import executorch.exir as exir import torch +from executorch.backends.samsung._passes.fuse_conv_act import FuseConvActPass +from executorch.backends.samsung._passes.remove_useless_ops import RemoveUselessOpPass from executorch.backends.samsung.partition.enn_partitioner import EnnPartitioner +from executorch.backends.samsung.quantizer.quantizer import EnnQuantizer, Precision +from executorch.backends.transforms.decompose_sdpa import ( + DecomposeScaledDotProductAttention, +) from executorch.backends.transforms.remove_clone_ops import RemoveCloneOpsTransform from executorch.exir import EdgeCompileConfig from executorch.exir.backend.backend_details import CompileSpec - from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_manager import PassType from executorch.exir.program._program import to_edge_transform_and_lower +from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e def get_edge_compile_config(): + # Maybe most ops in non-decomposition list should be added here + # TODO: to confirm whether all op in none-decomposed table should be added here return EdgeCompileConfig( _skip_dim_order=True, _core_aten_ops_exception_list=[ @@ -29,24 +39,55 @@ def get_edge_compile_config(): exir_ops.edge.aten._safe_softmax.default, exir_ops.edge.aten.layer_norm.default, exir_ops.edge.aten.matmul.default, + exir_ops.edge.aten.hardsigmoid.default, ], ) +def get_enn_pass_list() -> List[PassType]: + return [ + RemoveUselessOpPass(), + RemoveCloneOpsTransform(), + FuseConvActPass(), + ] + + +def quantize_module( + module: torch.nn.Module, + inputs, + calibration_dataset, + precision: Precision, + is_per_channel: bool = True, + is_qat: bool = False, +) -> torch.nn.Module: + quantizer = EnnQuantizer() + quantizer.setup_quant_params(precision, is_per_channel, is_qat) + logging.info("Export nn module for quantization...") + exported_module = 
torch.export.export_for_training(module, inputs).module()
+    DecomposeScaledDotProductAttention()(exported_module)
+    logging.info("Quantizing the module...")
+    annotated_module = prepare_pt2e(exported_module, quantizer)
+    for data in calibration_dataset:
+        annotated_module(*data)
+    quantized_module = convert_pt2e(annotated_module, fold_quantize=False)
+    logging.info("Quantizing finished.")
+    return quantized_module
+
+
 def to_edge_transform_and_lower_to_enn(
     module: torch.nn.Module,
     inputs: Tuple[torch.Tensor],
+    custom_pass_config: List[PassType] = None,
     compile_specs: Optional[CompileSpec] = None,
 ) -> exir.ExecutorchProgramManager:
-    assert (
-        compile_specs is not None
-    ), "Please provide compile specifications for enn backend"
+    assert compile_specs is not None, "For now, we must deliver compile specs"
     prog = torch.export.export(module, inputs)
-
-    ahead_pass_list = [RemoveCloneOpsTransform()]
+    pass_list = get_enn_pass_list()
+    if custom_pass_config:
+        pass_list.extend(custom_pass_config)
     return to_edge_transform_and_lower(
         prog,
-        ahead_pass_list,
+        pass_list,
         {"forward": [EnnPartitioner(compile_specs)]},
         compile_config=get_edge_compile_config(),
     )
diff --git a/backends/samsung/utils/utils.py b/backends/samsung/utils/utils.py
index 5da9808f38f..bbbec518b2a 100644
--- a/backends/samsung/utils/utils.py
+++ b/backends/samsung/utils/utils.py
@@ -4,12 +4,13 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
-from typing import List +from typing import List, Optional, Tuple import torch from executorch.backends.transforms.utils import is_param_node from executorch.exir.backend.backend_details import CompileSpec +from executorch.exir.dialects._ops import ops as exir_ops from torch.export.exported_program import ExportedProgram @@ -35,3 +36,90 @@ def is_graph_output(node: torch.fx.Node) -> bool: ): return True return False + + +def _quantize_per_tensor( + in_tensor: torch.Tensor, + scales: List[float], + zeropoints: List[int], + dtype: torch.dtype, + qrange: Optional[Tuple[int, int]], +): + assert ( + len(scales) == 1 + ), "For per-tensor quantization, there should be only one scale/zeropoint" + return exir_ops.edge.quantized_decomposed.quantize_per_tensor.default( + in_tensor, + torch.Tensor(scales), + torch.Tensor(zeropoints), + qrange[0], + qrange[1], + dtype, + ) + + +def _quantize_per_channel( + in_tensor: torch.Tensor, + scales: List[float], + zeropoints: List[int], + dtype: torch.dtype, + qrange: Optional[Tuple[int, int]], + axis: Optional[int], # Only for per-channel +): + assert ( + len(scales) == in_tensor.shape[axis] + ), "Shape not match for quant params and input tensor" + return exir_ops.edge.quantized_decomposed.quantize_per_channel.default( + in_tensor, + torch.Tensor(scales), + torch.Tensor(zeropoints), + axis, + qrange[0], + qrange[1], + dtype, + ) + + +def quantize_tensor( + in_tensor: torch.Tensor, + scales: List[float], + zeropoints: List[int], + dtype: torch.dtype, + qrange: Optional[Tuple[int, int]] = None, + axis: Optional[int] = None, # Only for per-channel +) -> torch.Tensor: + """ + To quantize constant tensor by executorch OPs. If `axis` not set, we quantize the tensor by per tensor. + If `axis` was set, we do per-channel quantize. + + :param in_tensor: The tensor to be quantized + :param scales: List of scales. For per-tensor quantization, it should contain only one element + :param zeropoints: List of zeropoints. 
For per-tensor quantization, it should contain only one element
+    :param dtype: The output dtype
+    :param qrange: The quantization range (qmin, qmax).
+        If not set, we will get the maximum range of the dtype by `torch.iinfo`
+    :param axis: We do per-channel quantize by which axis.
+        Only when this parameter set, we do per-channel quantization
+    :type in_tensor: torch.Tensor
+    :type scales: List[float]
+    :type zeropoints: List[int]
+    :type dtype: torch.dtype
+    :type qrange: Optional[Tuple[int,int]]
+    :type axis: Optional[int]
+    :return: The quantized tensor
+    """
+    assert len(scales) == len(
+        zeropoints
+    ), "scales should have same shape with zeropoints"
+    if not qrange:
+        qrange = (torch.iinfo(dtype).min, torch.iinfo(dtype).max)
+
+    if axis is not None:
+        return _quantize_per_channel(in_tensor, scales, zeropoints, dtype, qrange, axis)
+    return _quantize_per_tensor(
+        in_tensor,
+        scales,
+        zeropoints,
+        dtype,
+        qrange,
+    )
diff --git a/backends/test/harness/tester.py b/backends/test/harness/tester.py
index 351bab4a605..02c6fc4c82d 100644
--- a/backends/test/harness/tester.py
+++ b/backends/test/harness/tester.py
@@ -1,3 +1,8 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+ import random from collections import Counter, OrderedDict from typing import Any, Callable, Dict, List, Optional, Tuple @@ -62,6 +67,7 @@ def __init__( StageType.RUN_PASSES: [ StageType.PARTITION, StageType.TO_EDGE_TRANSFORM_AND_LOWER, + StageType.TO_EXECUTORCH, ], # TODO Make this Stage optional StageType.PARTITION: [StageType.TO_EXECUTORCH], diff --git a/backends/test/multi_method_delegate_test.cpp b/backends/test/multi_method_delegate_test.cpp index e24585434c4..bf17d7c8743 100644 --- a/backends/test/multi_method_delegate_test.cpp +++ b/backends/test/multi_method_delegate_test.cpp @@ -5,6 +5,10 @@ #include #include +#include + +#include +#include #include #include @@ -12,6 +16,11 @@ #include #include +using executorch::backends::xnnpack::workspace_sharing_mode_option_key; +using executorch::backends::xnnpack::WorkspaceSharingMode; +using executorch::backends::xnnpack::xnnpack_backend_key; + +using executorch::runtime::BackendOptions; using executorch::runtime::Error; using executorch::runtime::EValue; using executorch::runtime::HierarchicalAllocator; @@ -126,34 +135,61 @@ class XNNPACKMultiDelegateTest : public ETPTEMethodRunBaseTest { num_threads = 40; kMethodName = "forward"; } -}; -// This test is to validate the assumption that the delegate is thread safe. -// That includes the following: -// 1. The delegate can be initilized by multiple threads in parallel. -// 2. The delegate can be executed by multiple threads in parallel. -// 3. The delegate can be destroyed by multiple threads in parallel. -// Regardless of the underlying implementation of the delegate. -// This is particularly important when we have shared resources across -// delegate instances through a singleton backend instance. 
-TEST_F(XNNPACKMultiDelegateTest, MultipleThreads) { - ASSERT_NE(kTestPTE1Path.size(), 0); - ASSERT_NE(kTestPTE2Path.size(), 0); - ASSERT_NE(num_threads, 0); - ASSERT_NE(kMethodName.size(), 0); - - std::vector threads(num_threads); - std::atomic count{0}; - - for (int i = 0; i < num_threads; i++) { - threads[i] = std::thread([&, i]() { - run(i, i % 7 ? kTestPTE1Path : kTestPTE2Path, kMethodName, count); - }); + // This test is to validate the assumption that the delegate is thread safe. + // That includes the following: + // 1. The delegate can be initilized by multiple threads in parallel. + // 2. The delegate can be executed by multiple threads in parallel. + // 3. The delegate can be destroyed by multiple threads in parallel. + // Regardless of the underlying implementation of the delegate. + // This is particularly important when we have shared resources across + // delegate instances through a singleton backend instance. + void runStressTest() { + ASSERT_NE(kTestPTE1Path.size(), 0); + ASSERT_NE(kTestPTE2Path.size(), 0); + ASSERT_NE(num_threads, 0); + ASSERT_NE(kMethodName.size(), 0); + + std::vector threads(num_threads); + std::atomic count{0}; + + for (int i = 0; i < num_threads; i++) { + threads[i] = std::thread([&, i]() { + run(i, i % 7 ? 
kTestPTE1Path : kTestPTE2Path, kMethodName, count); + }); + } + for (int i = 0; i < num_threads; i++) { + threads[i].join(); + } + ASSERT_EQ(count, num_threads); } - for (int i = 0; i < num_threads; i++) { - threads[i].join(); + + void setWorkspaceSharingMode(WorkspaceSharingMode mode) { + executorch::runtime::runtime_init(); + + BackendOptions<1> backend_options; + backend_options.set_option( + workspace_sharing_mode_option_key, static_cast(mode)); + + auto status = executorch::runtime::set_option( + xnnpack_backend_key, backend_options.view()); + ASSERT_EQ(status, Error::Ok); } - ASSERT_EQ(count, num_threads); +}; + +TEST_F(XNNPACKMultiDelegateTest, MultipleThreadsSharingDisabled) { + setWorkspaceSharingMode(WorkspaceSharingMode::Disabled); + runStressTest(); +} + +TEST_F(XNNPACKMultiDelegateTest, MultipleThreadsPerModelSharing) { + setWorkspaceSharingMode(WorkspaceSharingMode::PerModel); + runStressTest(); +} + +TEST_F(XNNPACKMultiDelegateTest, MultipleThreadsGlobalSharing) { + setWorkspaceSharingMode(WorkspaceSharingMode::Global); + runStressTest(); } // TODO(T208989291): Add more tests here. For example, diff --git a/backends/test/suite/README.md b/backends/test/suite/README.md index 564f44362ad..901cd461dbe 100644 --- a/backends/test/suite/README.md +++ b/backends/test/suite/README.md @@ -5,37 +5,71 @@ This directory contains tests that validate correctness and coverage of backends These tests are intended to ensure that backends are robust and provide a smooth, "out-of-box" experience for users across the full span of input patterns. They are not intended to be a replacement for backend-specific tests, as they do not attempt to validate performance or that backends delegate operators that they expect to. ## Running Tests and Interpreting Output -Tests can be run from the command line, either using the runner.py entry point or the standard Python unittest runner. 
When running through runner.py, the test runner will report test statistics, including the number of tests with each result type. +Tests can be run from the command line using pytest. When generating a JSON test report, the runner will report detailed test statistics, including output accuracy, delegated nodes, lowering timing, and more. -Backends can be specified with the `ET_TEST_ENABLED_BACKENDS` environment variable. By default, all available backends are enabled. Note that backends such as Core ML or Vulkan may require specific hardware or software to be available. See the documentation for each backend for information on requirements. +Each backend and test flow (recipe) registers a pytest [marker](https://docs.pytest.org/en/stable/example/markers.html) that can be passed to pytest with the `-m marker` argument to filter execution. -Example: +To run all XNNPACK backend operator tests: ``` -ET_TEST_ENABLED_BACKENDS=xnnpack python -m executorch.backends.test.suite.runner +pytest -c /dev/nul backends/test/suite/operators/ -m backend_xnnpack -n auto ``` +To run all model tests for the CoreML static int8 lowering flow: +``` +pytest -c /dev/nul backends/test/suite/models/ -m flow_coreml_static_int8 -n auto ``` -2465 Passed / 2494 -16 Failed -13 Skipped -[Success] -736 Delegated -1729 Undelegated +To run a specific test: +``` +pytest -c /dev/nul backends/test/suite/ -k "test_prelu_f32_custom_init[xnnpack]" +``` -[Failure] -5 Lowering Fail -3 PTE Run Fail -8 Output Mismatch Fail +To generate a JSON report: +``` +pytest -c /dev/nul backends/test/suite/operators/ -n auto --json-report --json-report-file="test_report.json" ``` -Outcomes can be interpreted as follows: - * Success (delegated): The test passed and at least one op was delegated by the backend. - * Success (undelegated): The test passed with no ops delegated by the backend. This is a pass, as the partitioner works as intended. - * Skipped: test fails in eager or export (indicative of a test or dynamo issue). 
- * Lowering fail: The test fails in to_edge_transform_and_lower. - * PTE run failure: The test errors out when loading or running the method. - * Output mismatch failure: Output delta (vs eager) exceeds the configured tolerance. +See [pytest-json-report](https://pypi.org/project/pytest-json-report/) for information on the report format. The test logic in this repository attaches additional metadata to each test entry under the `metadata`/`subtests` keys. One entry is created for each call to `test_runner.lower_and_run_model`. + +Here is a excerpt from a test run, showing a successful run of the `test_add_f32_bcast_first[xnnpack]` test. +```json +"tests": [ + { + "nodeid": "operators/test_add.py::test_add_f32_bcast_first[xnnpack]", + "lineno": 38, + "outcome": "passed", + "keywords": [ + "test_add_f32_bcast_first[xnnpack]", + "flow_xnnpack", + "backend_xnnpack", + ... + ], + "metadata": { + "subtests": [ + { + "Test ID": "test_add_f32_bcast_first[xnnpack]", + "Test Case": "test_add_f32_bcast_first", + "Subtest": 0, + "Flow": "xnnpack", + "Result": "Pass", + "Result Detail": "", + "Error": "", + "Delegated": "True", + "Quantize Time (s)": null, + "Lower Time (s)": "2.881", + "Output 0 Error Max": "0.000", + "Output 0 Error MAE": "0.000", + "Output 0 SNR": "inf", + "Delegated Nodes": 1, + "Undelegated Nodes": 0, + "Delegated Ops": { + "aten::add.Tensor": 1 + }, + "PTE Size (Kb)": "1.600" + } + ] + } +``` ## Backend Registration @@ -43,11 +77,11 @@ To plug into the test framework, each backend should provide an implementation o At a minimum, the backend will likely need to provide a custom implementation of the Partition and ToEdgeTransformAndLower stages using the appropriate backend partitioner. See backends/xnnpack/test/tester/tester.py for an example implementation. -Once a tester is available, the backend flow(s) can be added in __init__.py in this directory by adding an entry to `ALL_TESTER_FLOWS`. 
Each flow entry consists of a name (used in the test case naming) and a function to instantiate a tester for a given model and input tuple. +Once a tester is available, the backend flow(s) can be added under flows/ and registered in flow.py. It is intended that this will be unified with the lowering recipes under executorch/export in the near future. ## Test Cases -Operator test cases are defined under the operators/ directory. Tests are written in a backend-independent manner, and each test is programmatically expanded to generate a variant for each registered backend flow. The `@operator_test` decorator is applied to each test class to trigger this behavior. Tests can also be tagged with an appropriate type specifier, such as `@dtype_test`, to generate variants for each dtype. The decorators and "magic" live in __init__.py in this directory. +Operator test cases are defined under the operators/ directory. Model tests are under models/. Tests are written in a backend-independent manner, and each test is programmatically expanded to generate a variant for each registered backend flow by use of the `test_runner` fixture parameter. Tests can additionally be parameterized using standard pytest decorators. Parameterizing over dtype is a common use case. ## Evolution of this Test Suite diff --git a/backends/test/suite/__init__.py b/backends/test/suite/__init__.py index 43d4e16818f..734a6690fd2 100644 --- a/backends/test/suite/__init__.py +++ b/backends/test/suite/__init__.py @@ -11,6 +11,7 @@ import os import executorch.backends.test.suite.flow +import torch from executorch.backends.test.suite.flow import TestFlow from executorch.backends.test.suite.runner import runner_main @@ -55,6 +56,11 @@ def get_test_flows() -> dict[str, TestFlow]: return _ALL_TEST_FLOWS +def dtype_to_str(dtype: torch.dtype) -> str: + # Strip off "torch." 
+ return str(dtype)[6:] + + def load_tests(loader, suite, pattern): package_dir = os.path.dirname(__file__) discovered_suite = loader.discover( diff --git a/backends/test/suite/conftest.py b/backends/test/suite/conftest.py new file mode 100644 index 00000000000..70a97454c4e --- /dev/null +++ b/backends/test/suite/conftest.py @@ -0,0 +1,182 @@ +from typing import Any + +import pytest +import torch + +from executorch.backends.test.suite.flow import all_flows +from executorch.backends.test.suite.reporting import _sum_op_counts +from executorch.backends.test.suite.runner import run_test + + +def pytest_configure(config): + backends = set() + + for flow in all_flows().values(): + config.addinivalue_line( + "markers", + f"flow_{flow.name}: mark a test as testing the {flow.name} flow", + ) + + if flow.backend not in backends: + config.addinivalue_line( + "markers", + f"backend_{flow.backend}: mark a test as testing the {flow.backend} backend", + ) + backends.add(flow.backend) + + +class TestRunner: + def __init__(self, flow, test_name, test_base_name): + self._flow = flow + self._test_name = test_name + self._test_base_name = test_base_name + self._subtest = 0 + self._results = [] + + def lower_and_run_model( + self, + model: torch.nn.Module, + inputs: Any, + generate_random_test_inputs=True, + dynamic_shapes=None, + ): + run_summary = run_test( + model, + inputs, + self._flow, + self._test_name, + self._test_base_name, + self._subtest, + None, + generate_random_test_inputs=generate_random_test_inputs, + dynamic_shapes=dynamic_shapes, + ) + + self._subtest += 1 + self._results.append(run_summary) + + if not run_summary.result.is_success(): + if run_summary.result.is_backend_failure(): + raise RuntimeError("Test failure.") from run_summary.error + else: + # Non-backend failure indicates a bad test. Mark as skipped. + pytest.skip( + f"Test failed for reasons other than backend failure. 
Error: {run_summary.error}" + ) + + +@pytest.fixture( + params=[ + pytest.param( + f, + marks=[ + getattr(pytest.mark, f"flow_{f.name}"), + getattr(pytest.mark, f"backend_{f.backend}"), + ], + ) + for f in all_flows().values() + ], + ids=str, +) +def test_runner(request): + return TestRunner(request.param, request.node.name, request.node.originalname) + + +@pytest.hookimpl(optionalhook=True) +def pytest_json_runtest_metadata(item, call): + # Store detailed results in the test report under the metadata key. + metadata = {"subtests": []} + + if hasattr(item, "funcargs") and "test_runner" in item.funcargs: + runner_instance = item.funcargs["test_runner"] + + for record in runner_instance._results: + subtest_metadata = {} + + error_message = "" + if record.error is not None: + error_str = str(record.error) + if len(error_str) > 400: + error_message = error_str[:200] + "..." + error_str[-200:] + else: + error_message = error_str + + subtest_metadata["Test ID"] = record.name + subtest_metadata["Test Case"] = record.base_name + subtest_metadata["Subtest"] = record.subtest_index + subtest_metadata["Flow"] = record.flow + subtest_metadata["Result"] = record.result.to_short_str() + subtest_metadata["Result Detail"] = record.result.to_detail_str() + subtest_metadata["Error"] = error_message + subtest_metadata["Delegated"] = "True" if record.is_delegated() else "False" + subtest_metadata["Quantize Time (s)"] = ( + f"{record.quantize_time.total_seconds():.3f}" + if record.quantize_time + else None + ) + subtest_metadata["Lower Time (s)"] = ( + f"{record.lower_time.total_seconds():.3f}" + if record.lower_time + else None + ) + + for output_idx, error_stats in enumerate(record.tensor_error_statistics): + subtest_metadata[f"Output {output_idx} Error Max"] = ( + f"{error_stats.error_max:.3f}" + ) + subtest_metadata[f"Output {output_idx} Error MAE"] = ( + f"{error_stats.error_mae:.3f}" + ) + subtest_metadata[f"Output {output_idx} SNR"] = f"{error_stats.sqnr:.3f}" + + 
subtest_metadata["Delegated Nodes"] = _sum_op_counts( + record.delegated_op_counts + ) + subtest_metadata["Undelegated Nodes"] = _sum_op_counts( + record.undelegated_op_counts + ) + if record.delegated_op_counts: + subtest_metadata["Delegated Ops"] = dict(record.delegated_op_counts) + if record.undelegated_op_counts: + subtest_metadata["Undelegated Ops"] = dict(record.undelegated_op_counts) + subtest_metadata["PTE Size (Kb)"] = ( + f"{record.pte_size_bytes / 1000.0:.3f}" if record.pte_size_bytes else "" + ) + + metadata["subtests"].append(subtest_metadata) + return metadata + + +@pytest.hookimpl(optionalhook=True) +def pytest_json_modifyreport(json_report): + # Post-process the report, mainly to populate metadata for crashed tests. The runtest_metadata + # hook doesn't seem to be called when there's a native crash, but xdist still creates a report + # entry. + + for test_data in json_report["tests"]: + if "metadata" not in test_data: + test_data["metadata"] = {} + metadata = test_data["metadata"] + if "subtests" not in metadata: + metadata["subtests"] = [] + subtests = metadata["subtests"] + + # Native crashes are recorded differently and won't have the full metadata. + # Pytest-xdist records crash info under the "???" key. + if "???" 
in test_data: + test_id = test_data["nodeid"].removeprefix("::") # Remove leading :: + test_base_id = test_id.split("[")[ + 0 + ] # Strip parameterization to get the base test case + params = test_id[len(test_base_id) + 1 : -1].split("-") + flow = params[0] + + crashed_test_meta = { + "Test ID": test_id, + "Test Case": test_base_id, + "Flow": flow, + "Result": "Fail", + "Result Detail": "Process Crash", + "Error": test_data["???"].get("longrepr", "Process crashed."), + } + subtests.append(crashed_test_meta) diff --git a/backends/test/suite/flow.py b/backends/test/suite/flow.py index a4b34fee98d..29394951bd7 100644 --- a/backends/test/suite/flow.py +++ b/backends/test/suite/flow.py @@ -1,3 +1,8 @@ +# Copyright 2025 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + import logging from dataclasses import dataclass, field @@ -44,6 +49,9 @@ class TestFlow: def should_skip_test(self, test_name: str) -> bool: return any(pattern in test_name for pattern in self.skip_patterns) + def __str__(self): + return self.name + def all_flows() -> dict[str, TestFlow]: flows = [] @@ -119,10 +127,18 @@ def all_flows() -> dict[str, TestFlow]: logger.info(f"Skipping QNN flow registration: {e}") try: - from executorch.backends.test.suite.flows.arm import ARM_TOSA_FLOW + from executorch.backends.test.suite.flows.arm import ( + ARM_ETHOS_U55_FLOW, + ARM_ETHOS_U85_FLOW, + ARM_TOSA_FP_FLOW, + ARM_TOSA_INT_FLOW, + ) flows += [ - ARM_TOSA_FLOW, + ARM_TOSA_FP_FLOW, + ARM_TOSA_INT_FLOW, + ARM_ETHOS_U55_FLOW, + ARM_ETHOS_U85_FLOW, ] except Exception as e: logger.info(f"Skipping ARM flow registration: {e}") diff --git a/backends/test/suite/flows/arm.py b/backends/test/suite/flows/arm.py index baa2df79de9..85674331eda 100644 --- a/backends/test/suite/flows/arm.py +++ b/backends/test/suite/flows/arm.py @@ -1,24 +1,68 @@ +# Copyright 2025 Arm Limited and/or its affiliates. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Create flows for Arm Backends used to test operator and model suits + +from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec +from executorch.backends.arm.quantizer import get_symmetric_quantization_config from executorch.backends.arm.test import common from executorch.backends.arm.test.tester.arm_tester import ArmTester +from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec +from executorch.backends.arm.util._factory import create_quantizer from executorch.backends.test.suite.flow import TestFlow +from executorch.backends.xnnpack.test.tester.tester import Quantize -def _create_arm_tester_tosa_fp(*args, **kwargs) -> ArmTester: - kwargs["compile_spec"] = common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP") +def _create_arm_flow( + name, + compile_spec: ArmCompileSpec, + symmetric_io_quantization: bool = False, + per_channel_quantization: bool = True, +) -> TestFlow: - return ArmTester( - *args, - **kwargs, - ) + def _create_arm_tester(*args, **kwargs) -> ArmTester: + kwargs["compile_spec"] = compile_spec + return ArmTester(*args, **kwargs) + + support_serialize = not isinstance(compile_spec, TosaCompileSpec) + quantize = compile_spec.tosa_spec.support_integer() + + if quantize is True: + def create_quantize_stage() -> Quantize: + quantizer = create_quantizer(compile_spec) + quantization_config = get_symmetric_quantization_config( + is_per_channel=per_channel_quantization + ) + if symmetric_io_quantization: + quantizer.set_io(quantization_config) + return Quantize(quantizer, quantization_config) -def _create_tosa_flow() -> TestFlow: return TestFlow( - "arm_tosa", + name, backend="arm", - tester_factory=_create_arm_tester_tosa_fp, - supports_serialize=False, + tester_factory=_create_arm_tester, + supports_serialize=support_serialize, + quantize=quantize, + 
quantize_stage_factory=(create_quantize_stage if quantize is True else False), ) -ARM_TOSA_FLOW = _create_tosa_flow() +ARM_TOSA_FP_FLOW = _create_arm_flow( + "arm_tosa_fp", + common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+FP"), +) +ARM_TOSA_INT_FLOW = _create_arm_flow( + "arm_tosa_int", + common.get_tosa_compile_spec(tosa_spec="TOSA-1.0+INT"), +) +ARM_ETHOS_U55_FLOW = _create_arm_flow( + "arm_ethos_u55", + common.get_u55_compile_spec(), +) +ARM_ETHOS_U85_FLOW = _create_arm_flow( + "arm_ethos_u85", + common.get_u85_compile_spec(), +) diff --git a/backends/test/suite/flows/qualcomm.py b/backends/test/suite/flows/qualcomm.py index 9998caa51b6..99deb3d4877 100644 --- a/backends/test/suite/flows/qualcomm.py +++ b/backends/test/suite/flows/qualcomm.py @@ -42,7 +42,7 @@ def create_quantize_stage() -> Quantize: QNN_TEST_FLOW = _create_qnn_flow("qnn") QNN_16A16W_TEST_FLOW = _create_qnn_flow( - "qnn_16a16w", quantize=True, quant_dtype=QuantDtype.use_8a8w, use_fp16=False + "qnn_16a16w", quantize=True, quant_dtype=QuantDtype.use_16a16w, use_fp16=False ) QNN_16A8W_TEST_FLOW = _create_qnn_flow( "qnn_16a8w", quantize=True, quant_dtype=QuantDtype.use_16a8w, use_fp16=False diff --git a/backends/test/suite/generate_markdown_summary.py b/backends/test/suite/generate_markdown_summary.py index 73da8fba678..e54fc691723 100644 --- a/backends/test/suite/generate_markdown_summary.py +++ b/backends/test/suite/generate_markdown_summary.py @@ -1,44 +1,69 @@ import argparse import csv +import json import sys -# -# A standalone script to generate a Markdown representation of a test report. -# This is primarily intended to be used with GitHub actions to generate a nice -# representation of the test results when looking at the action run. -# -# Usage: python executorch/backends/test/suite/generate_markdown_summary.py -# Markdown is written to stdout. 
-# +from dataclasses import dataclass, field -def escape_for_markdown(text: str) -> str: +@dataclass +class ResultCounts: """ - Modify a string to properly display in a markdown table cell. + Represents aggregated result counts for each status. """ - if not text: - return text - # Replace newlines with
tags - escaped = text.replace("\n", "
") + total: int = 0 + passes: int = 0 + fails: int = 0 + skips: int = 0 + by_detail: dict[str, int] = field(default_factory=lambda: {}) - # Escape backslashes. - escaped = escaped.replace("\\", "\\\\") + def add_row(self, result_value: str, result_detail: str) -> None: + """ + Update the result counts for the specified row. + """ - # Escape pipe characters that would break table structure - escaped = escaped.replace("|", "\\|") + self.total += 1 - return escaped + if result_value == "Pass": + self.passes += 1 + elif result_value == "Fail": + self.fails += 1 + elif result_value == "Skip": + self.skips += 1 + else: + raise RuntimeError(f"Unknown result value {result_value}") + if result_detail: + if result_detail not in self.by_detail: + self.by_detail[result_detail] = 0 + + self.by_detail[result_detail] += 1 + + +@dataclass +class AggregatedSummary: + """ + Represents aggegrated summary data for the test run. + """ + + counts: ResultCounts + counts_by_params: dict[str, ResultCounts] + failed_tests: list[list[str]] + header: list[str] + + +# +# A standalone script to generate a Markdown representation of a test report. +# This is primarily intended to be used with GitHub actions to generate a nice +# representation of the test results when looking at the action run. +# +# Usage: python executorch/backends/test/suite/generate_markdown_summary.py +# Markdown is written to stdout. +# -def generate_markdown(csv_path: str, exit_code: int = 0): # noqa (C901) - # Print warning if exit code is non-zero - if exit_code != 0: - print("> [!WARNING]") - print( - f"> Exit code {exit_code} was non-zero. Test process may have crashed. 
Check the job logs for more information.\n" - ) +def aggregate_results(csv_path: str) -> AggregatedSummary: with open(csv_path, newline="", encoding="utf-8") as f: reader = csv.reader(f) rows = list(reader) @@ -46,24 +71,28 @@ def generate_markdown(csv_path: str, exit_code: int = 0): # noqa (C901) header = rows[0] data_rows = rows[1:] - # Find the Result and Result Detail column indices - result_column_index = None - result_detail_column_index = None - for i, col in enumerate(header): - if col.lower() == "result": - result_column_index = i - elif col.lower() == "result detail": - result_detail_column_index = i + header_indices_by_name = {n.lower(): i for (i, n) in enumerate(header)} + params_column_index = header_indices_by_name.get("params", None) + result_column_index = header_indices_by_name["result"] + result_detail_column_index = header_indices_by_name["result detail"] # Count results and prepare data - pass_count = 0 - fail_count = 0 - skip_count = 0 + counts = ResultCounts() failed_tests = [] - processed_rows = [] - result_detail_counts = {} + counts_by_param = {} for row in data_rows: + result = row[result_column_index] + result_detail = row[result_detail_column_index] + + counts.add_row(result, result_detail) + + params = row[params_column_index] if params_column_index else None + if params: + if params not in counts_by_param: + counts_by_param[params] = ResultCounts() + counts_by_param[params].add_row(result, result_detail) + # Make a copy of the row to avoid modifying the original processed_row = [escape_for_markdown(cell) for cell in row] @@ -71,54 +100,130 @@ def generate_markdown(csv_path: str, exit_code: int = 0): # noqa (C901) if result_column_index is not None and result_column_index < len(row): result_value = row[result_column_index].strip().lower() if result_value == "pass": - pass_count += 1 processed_row[result_column_index] = ( 'Pass' ) elif result_value == "fail": - fail_count += 1 processed_row[result_column_index] = ( 'Fail' ) 
failed_tests.append(processed_row.copy()) elif result_value == "skip": - skip_count += 1 processed_row[result_column_index] = ( 'Skip' ) - # Count result details (excluding empty ones) - if result_detail_column_index is not None and result_detail_column_index < len( - row - ): - result_detail_value = row[result_detail_column_index].strip() - if result_detail_value: # Only count non-empty result details - if result_detail_value in result_detail_counts: - result_detail_counts[result_detail_value] += 1 - else: - result_detail_counts[result_detail_value] = 1 + return AggregatedSummary( + counts=counts, + failed_tests=failed_tests, + counts_by_params=counts_by_param, + header=header, + ) + + +def escape_for_markdown(text: str) -> str: + """ + Modify a string to properly display in a markdown table cell. + """ + if not text: + return text + + # Replace newlines with
tags + escaped = text.replace("\n", "
") - processed_rows.append(processed_row) + # Escape backslashes. + escaped = escaped.replace("\\", "\\\\") + + # Escape pipe characters that would break table structure + escaped = escaped.replace("|", "\\|") + + return escaped + + +def generate_markdown(csv_path: str, exit_code: int = 0): # noqa (C901) + # Print warning if exit code is non-zero + if exit_code != 0: + print("> [!WARNING]") + print( + f"> Exit code {exit_code} was non-zero. Test process may have crashed. Check the job logs for more information.\n" + ) + + results = aggregate_results(csv_path) # Generate Summary section - total_rows = len(data_rows) print("# Summary\n") - print(f"- **Pass**: {pass_count}/{total_rows}") - print(f"- **Fail**: {fail_count}/{total_rows}") - print(f"- **Skip**: {skip_count}/{total_rows}") + total_excluding_skips = results.counts.passes + results.counts.fails + pass_fraction = results.counts.passes / total_excluding_skips + fail_fraction = results.counts.fails / total_excluding_skips + print( + f"- **Pass**: {results.counts.passes}/{total_excluding_skips} ({pass_fraction*100:.2f}%)" + ) + print( + f"- **Fail**: {results.counts.fails}/{total_excluding_skips} ({fail_fraction*100:.2f}%)" + ) + print(f"- **Skip**: {results.counts.skips}") + + if results.counts_by_params: + print("\n## Results by Parameters\n") + + # Extract all unique parameter keys from the JSON strings + all_param_keys = set() + parsed_params = {} + + for params_str in results.counts_by_params.keys(): + # Parse the JSON string (it's a string representation of a dict) + params_dict = json.loads(params_str) + parsed_params[params_str] = params_dict + all_param_keys.update(params_dict.keys()) + + if parsed_params and len(parsed_params) > 1: + # Sort parameter keys for consistent column ordering + sorted_param_keys = sorted(all_param_keys) + + # Create table header + header_cols = sorted_param_keys + ["Pass", "Fail", "Skip", "Pass %"] + print("| " + " | ".join(header_cols) + " |") + print("|" + "|".join(["---"] 
* len(header_cols)) + "|") + + # Create table rows + for params_str, counts in results.counts_by_params.items(): + if params_str in parsed_params: + params_dict = parsed_params[params_str] + row_values = [] + + # Add parameter values + for key in sorted_param_keys: + value = params_dict.get(key, "") + row_values.append(str(value)) + + pass_fraction = counts.passes / (counts.passes + counts.fails) + + # Add count values + row_values.extend( + [ + str(counts.passes), + str(counts.fails), + str(counts.skips), + f"{pass_fraction*100:.2f}%", + ] + ) + + print("| " + " | ".join(row_values) + " |") + + print() print("## Failure Breakdown:") - total_rows_with_result_detail = sum(result_detail_counts.values()) - for detail, count in sorted(result_detail_counts.items()): + total_rows_with_result_detail = sum(results.counts.by_detail.values()) + for detail, count in sorted(results.counts.by_detail.items()): print(f"- **{detail}**: {count}/{total_rows_with_result_detail}") # Generate Failed Tests section print("# Failed Tests\n") - if failed_tests: - escaped_header = [escape_for_markdown(col) for col in header] + if results.failed_tests: + escaped_header = [escape_for_markdown(col) for col in results.header] print("| " + " | ".join(escaped_header) + " |") - print("|" + "|".join(["---"] * len(header)) + "|") - for row in failed_tests: + print("|" + "|".join(["---"] * len(results.header)) + "|") + for row in results.failed_tests: print("| " + " | ".join(row) + " |") else: print("No failed tests.\n") diff --git a/backends/test/suite/generate_markdown_summary_json.py b/backends/test/suite/generate_markdown_summary_json.py new file mode 100644 index 00000000000..4b6edc2a635 --- /dev/null +++ b/backends/test/suite/generate_markdown_summary_json.py @@ -0,0 +1,229 @@ +import argparse +import json + +from dataclasses import dataclass, field + + +@dataclass +class ResultCounts: + """ + Represents aggregated result counts for each status. 
+ """ + + total: int = 0 + passes: int = 0 + fails: int = 0 + skips: int = 0 + by_detail: dict[str, int] = field(default_factory=lambda: {}) + + def add_row(self, result_value: str, result_detail: str) -> None: + """ + Update the result counts for the specified row. + """ + + self.total += 1 + + if result_value == "Pass": + self.passes += 1 + elif result_value == "Fail": + self.fails += 1 + elif result_value == "Skip": + self.skips += 1 + else: + raise RuntimeError(f"Unknown result value {result_value}") + + if result_detail: + if result_detail not in self.by_detail: + self.by_detail[result_detail] = 0 + + self.by_detail[result_detail] += 1 + + +@dataclass +class AggregatedSummary: + """ + Represents aggegrated summary data for the test run. + """ + + counts: ResultCounts + counts_by_params: dict[str, ResultCounts] + failed_tests: list[list[str]] + + +# +# A standalone script to generate a Markdown representation of a test report. +# This is primarily intended to be used with GitHub actions to generate a nice +# representation of the test results when looking at the action run. +# +# Usage: python executorch/backends/test/suite/generate_markdown_summary.py +# Markdown is written to stdout. 
+# + + +def aggregate_results(json_path: str) -> AggregatedSummary: + with open(json_path) as f: + data = json.load(f) + + # Count results and prepare data + counts = ResultCounts() + failed_tests = [] + counts_by_param = {} + + for test_data in data["tests"]: + result_meta = test_data["metadata"] + for subtest_meta in result_meta["subtests"]: + result = subtest_meta["Result"] + result_detail = subtest_meta.get("Result Detail") or "" + + counts.add_row(result, result_detail) + + test_id = subtest_meta["Test ID"] + base_test = subtest_meta["Test Case"] + params = test_id[len(base_test) + 1 : -1] + + if params: + if params not in counts_by_param: + counts_by_param[params] = ResultCounts() + counts_by_param[params].add_row(result, result_detail) + + if result.lower() == "fail": + failed_tests.append(subtest_meta) + + return AggregatedSummary( + counts=counts, + failed_tests=failed_tests, + counts_by_params=counts_by_param, + ) + + +def escape_for_markdown(text: str) -> str: + """ + Modify a string to properly display in a markdown table cell. + """ + if not text: + return text + + # Replace newlines with
tags + escaped = text.replace("\n", "
") + + # Escape backslashes. + escaped = escaped.replace("\\", "\\\\") + + # Escape pipe characters that would break table structure + escaped = escaped.replace("|", "\\|") + + return escaped + + +def generate_markdown(json_path: str, exit_code: int = 0): # noqa (C901) + results = aggregate_results(json_path) + + # Generate Summary section + print("# Summary\n") + total_excluding_skips = results.counts.passes + results.counts.fails + pass_fraction = results.counts.passes / total_excluding_skips + fail_fraction = results.counts.fails / total_excluding_skips + print( + f"- **Pass**: {results.counts.passes}/{total_excluding_skips} ({pass_fraction*100:.2f}%)" + ) + print( + f"- **Fail**: {results.counts.fails}/{total_excluding_skips} ({fail_fraction*100:.2f}%)" + ) + print(f"- **Skip**: {results.counts.skips}") + + if results.counts_by_params: + print("\n## Results by Parameters\n") + + if len(results.counts_by_params) > 0: + # Create table header + header_cols = ["Params", "Pass", "Fail", "Skip", "Pass %"] + print("| " + " | ".join(header_cols) + " |") + print("|" + "|".join(["---"] * len(header_cols)) + "|") + + # Create table rows + for params_str, counts in results.counts_by_params.items(): + row_values = [params_str] + + # Add parameter values + pass_fraction = counts.passes / (counts.passes + counts.fails) + + # Add count values + row_values.extend( + [ + str(counts.passes), + str(counts.fails), + str(counts.skips), + f"{pass_fraction*100:.2f}%", + ] + ) + + print("| " + " | ".join(row_values) + " |") + + print() + + print("## Failure Breakdown:") + total_rows_with_result_detail = sum(results.counts.by_detail.values()) + for detail, count in sorted(results.counts.by_detail.items()): + print(f"- **{detail}**: {count}/{total_rows_with_result_detail}") + + # Generate Failed Tests section + print("# Failed Tests\n") + print( + "To reproduce, run the following command from the root of the ExecuTorch repository:" + ) + print("```") + print('pytest -c /dev/nul 
backends/test/suite/ -k ""') + print("```") + if results.failed_tests: + header = build_header(results.failed_tests) + + escaped_header = [escape_for_markdown(col) for col in header.keys()] + print("| " + " | ".join(escaped_header) + " |") + print("|" + "|".join(["---"] * len(escaped_header)) + "|") + for rec in results.failed_tests: + row = build_row(rec, header) + print("| " + " | ".join(row) + " |") + else: + print("No failed tests.\n") + + +def build_header(data) -> dict[str, int]: + """ + Find the union of all keys and return a dict of header keys and indices. Try to preserve + ordering as much as possible. + """ + + keys = max(data, key=len) + + header = {k: i for (i, k) in enumerate(keys)} + + for rec in data: + keys = set(rec.keys()) + for k in keys: + if k not in header: + header[k] = len(header) + + return header + + +def build_row(rec, header: dict[str, int]) -> list[str]: + row = [""] * len(header) + for k, v in rec.items(): + row[header[k]] = escape_for_markdown(str(v)) + return row + + +def main(): + parser = argparse.ArgumentParser( + description="Generate a Markdown representation of a test report." + ) + parser.add_argument("json_path", help="Path to the test report CSV file.") + parser.add_argument( + "--exit-code", type=int, default=0, help="Exit code from the test process." + ) + args = parser.parse_args() + generate_markdown(args.json_path, args.exit_code) + + +if __name__ == "__main__": + main() diff --git a/backends/test/suite/models/__init__.py b/backends/test/suite/models/__init__.py index ea44275a463..6ac1a72bde6 100644 --- a/backends/test/suite/models/__init__.py +++ b/backends/test/suite/models/__init__.py @@ -5,136 +5,3 @@ # LICENSE file in the root directory of this source tree. 
# pyre-unsafe - -import itertools -import os -import unittest -from typing import Any, Callable - -import torch -from executorch.backends.test.suite import get_test_flows -from executorch.backends.test.suite.context import get_active_test_context, TestContext -from executorch.backends.test.suite.flow import TestFlow -from executorch.backends.test.suite.reporting import log_test_summary -from executorch.backends.test.suite.runner import run_test - - -DTYPES: list[torch.dtype] = [ - torch.float16, - torch.float32, -] - - -def load_tests(loader, suite, pattern): - package_dir = os.path.dirname(__file__) - discovered_suite = loader.discover( - start_dir=package_dir, pattern=pattern or "test_*.py" - ) - suite.addTests(discovered_suite) - return suite - - -def _create_test( - cls, - test_func: Callable, - flow: TestFlow, - dtype: torch.dtype, - use_dynamic_shapes: bool, -): - dtype_name = str(dtype)[6:] # strip "torch." - test_name = f"{test_func.__name__}_{flow.name}_{dtype_name}" - if use_dynamic_shapes: - test_name += "_dynamic_shape" - - def wrapped_test(self): - params = { - "dtype": dtype, - "use_dynamic_shapes": use_dynamic_shapes, - } - with TestContext(test_name, test_func.__name__, flow.name, params): - if flow.should_skip_test(test_name): - raise unittest.SkipTest( - f"Skipping test due to matching flow {flow.name} skip patterns" - ) - - test_func(self, flow, dtype, use_dynamic_shapes) - - wrapped_test._name = test_func.__name__ # type: ignore - wrapped_test._flow = flow # type: ignore - - setattr(cls, test_name, wrapped_test) - - -# Expand a test into variants for each registered flow. 
-def _expand_test(cls, test_name: str) -> None: - test_func = getattr(cls, test_name) - supports_dynamic_shapes = getattr(test_func, "supports_dynamic_shapes", True) - dynamic_shape_values = [True, False] if supports_dynamic_shapes else [False] - dtypes = getattr(test_func, "dtypes", DTYPES) - - for flow, dtype, use_dynamic_shapes in itertools.product( - get_test_flows().values(), dtypes, dynamic_shape_values - ): - _create_test(cls, test_func, flow, dtype, use_dynamic_shapes) - delattr(cls, test_name) - - -def model_test_cls(cls) -> Callable | None: - """Decorator for model tests. Handles generating test variants for each test flow and configuration.""" - for key in dir(cls): - if key.startswith("test_"): - _expand_test(cls, key) - return cls - - -def model_test_params( - supports_dynamic_shapes: bool = True, - dtypes: list[torch.dtype] | None = None, -) -> Callable: - """Optional parameter decorator for model tests. Specifies test pararameters. Only valid with a class decorated by model_test_cls.""" - - def inner_decorator(func: Callable) -> Callable: - func.supports_dynamic_shapes = supports_dynamic_shapes # type: ignore - - if dtypes is not None: - func.dtypes = dtypes # type: ignore - - return func - - return inner_decorator - - -def run_model_test( - model: torch.nn.Module, - inputs: tuple[Any], - flow: TestFlow, - dtype: torch.dtype, - dynamic_shapes: Any | None, -): - model = model.to(dtype) - context = get_active_test_context() - - # This should be set in the wrapped test. See _create_test above. - assert context is not None, "Missing test context." 
- - run_summary = run_test( - model, - inputs, - flow, - context.test_name, - context.test_base_name, - 0, # subtest_index - currently unused for model tests - context.params, - dynamic_shapes=dynamic_shapes, - ) - - log_test_summary(run_summary) - - if not run_summary.result.is_success(): - if run_summary.result.is_backend_failure(): - raise RuntimeError("Test failure.") from run_summary.error - else: - # Non-backend failure indicates a bad test. Mark as skipped. - raise unittest.SkipTest( - f"Test failed for reasons other than backend failure. Error: {run_summary.error}" - ) diff --git a/backends/test/suite/models/test_torchaudio.py b/backends/test/suite/models/test_torchaudio.py index 69f6de4684f..2287b226c37 100644 --- a/backends/test/suite/models/test_torchaudio.py +++ b/backends/test/suite/models/test_torchaudio.py @@ -9,15 +9,11 @@ import unittest from typing import Tuple +import pytest import torch import torchaudio -from executorch.backends.test.suite.flow import TestFlow -from executorch.backends.test.suite.models import ( - model_test_cls, - model_test_params, - run_model_test, -) +from executorch.backends.test.suite import dtype_to_str from torch.export import Dim # @@ -47,64 +43,68 @@ def forward( return x.transpose(0, 1) -@model_test_cls -class TorchAudio(unittest.TestCase): - @model_test_params(dtypes=[torch.float32], supports_dynamic_shapes=False) - def test_conformer( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - inner_model = torchaudio.models.Conformer( - input_dim=80, - num_heads=4, - ffn_dim=128, - num_layers=4, - depthwise_conv_kernel_size=31, - ) - model = PatchedConformer(inner_model) - lengths = torch.randint(1, 400, (10,)) +@pytest.mark.parametrize("dtype", [torch.float32], ids=dtype_to_str) +@pytest.mark.parametrize("use_dynamic_shapes", [False], ids=["static_shapes"]) +def test_conformer(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + inner_model = torchaudio.models.Conformer( + input_dim=80, + 
num_heads=4, + ffn_dim=128, + num_layers=4, + depthwise_conv_kernel_size=31, + ) + model = PatchedConformer(inner_model).eval().to(dtype) + lengths = torch.randint(1, 400, (10,)) - encoder_padding_mask = torchaudio.models.conformer._lengths_to_padding_mask( - lengths - ) - inputs = ( - torch.rand(10, int(lengths.max()), 80), - encoder_padding_mask, - ) + encoder_padding_mask = torchaudio.models.conformer._lengths_to_padding_mask(lengths) + inputs = ( + torch.rand(10, int(lengths.max()), 80), + encoder_padding_mask, + ) + + test_runner.lower_and_run_model(model, inputs) - run_model_test(model, inputs, flow, dtype, None) - - @model_test_params(dtypes=[torch.float32]) - def test_wav2letter( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchaudio.models.Wav2Letter() - inputs = (torch.randn(1, 1, 1024, dtype=dtype),) - dynamic_shapes = ( - { - "x": { - 2: Dim("d", min=900, max=1024), - } + +@pytest.mark.parametrize("dtype", [torch.float32], ids=dtype_to_str) +@pytest.mark.parametrize( + "use_dynamic_shapes", [False, True], ids=["static_shapes", "dynamic_shapes"] +) +def test_wav2letter(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchaudio.models.Wav2Letter().to(dtype) + inputs = (torch.randn(1, 1, 1024, dtype=dtype),) + dynamic_shapes = ( + { + "x": { + 2: Dim("d", min=900, max=1024), } - if use_dynamic_shapes - else None - ) - run_model_test(model, inputs, flow, dtype, dynamic_shapes) - - @unittest.skip("This model times out on all backends.") - def test_wavernn( - self, - flow: TestFlow, - dtype: torch.dtype, - use_dynamic_shapes: bool, - ): - model = torchaudio.models.WaveRNN( + } + if use_dynamic_shapes + else None + ) + + test_runner.lower_and_run_model(model, inputs, dynamic_shapes=dynamic_shapes) + + +@pytest.mark.parametrize("dtype", [torch.float32], ids=dtype_to_str) +@pytest.mark.parametrize("use_dynamic_shapes", [False], ids=["static_shapes"]) +@unittest.skip("This model times out on all 
backends.") +def test_wavernn( + test_runner, + dtype: torch.dtype, + use_dynamic_shapes: bool, +): + model = ( + torchaudio.models.WaveRNN( upsample_scales=[5, 5, 8], n_classes=512, hop_length=200 - ).eval() - - # See https://docs.pytorch.org/audio/stable/generated/torchaudio.models.WaveRNN.html#forward - inputs = ( - torch.randn(1, 1, (64 - 5 + 1) * 200), # waveform - torch.randn(1, 1, 128, 64), # specgram ) + .eval() + .to(dtype) + ) + + # See https://docs.pytorch.org/audio/stable/generated/torchaudio.models.WaveRNN.html#forward + inputs = ( + torch.randn(1, 1, (64 - 5 + 1) * 200).to(dtype), # waveform + torch.randn(1, 1, 128, 64).to(dtype), # specgram + ) - run_model_test(model, inputs, flow, dtype, None) + test_runner.lower_and_run_model(model, inputs) diff --git a/backends/test/suite/models/test_torchvision.py b/backends/test/suite/models/test_torchvision.py index e69de80a871..58cf6a990d4 100644 --- a/backends/test/suite/models/test_torchvision.py +++ b/backends/test/suite/models/test_torchvision.py @@ -6,17 +6,12 @@ # pyre-unsafe -import unittest +import pytest import torch import torchvision +from executorch.backends.test.suite import dtype_to_str -from executorch.backends.test.suite.flow import TestFlow -from executorch.backends.test.suite.models import ( - model_test_cls, - model_test_params, - run_model_test, -) from torch.export import Dim # @@ -25,148 +20,175 @@ # multiple size variants, one small or medium variant is used. 
# +PARAMETERIZE_DTYPE = pytest.mark.parametrize("dtype", [torch.float32], ids=dtype_to_str) +PARAMETERIZE_DYNAMIC_SHAPES = pytest.mark.parametrize( + "use_dynamic_shapes", [False, True], ids=["static_shapes", "dynamic_shapes"] +) +PARAMETERIZE_STATIC_ONLY = pytest.mark.parametrize( + "use_dynamic_shapes", [False], ids=["static_shapes"] +) + + +def _test_cv_model( + model: torch.nn.Module, + test_runner, + dtype: torch.dtype, + use_dynamic_shapes: bool, +): + model = model.eval().to(dtype) + + # Test a CV model that follows the standard conventions. + inputs = (torch.randn(1, 3, 224, 224, dtype=dtype),) -@model_test_cls -class TorchVision(unittest.TestCase): - def _test_cv_model( - self, - model: torch.nn.Module, - flow: TestFlow, - dtype: torch.dtype, - use_dynamic_shapes: bool, - ): - # Test a CV model that follows the standard conventions. - inputs = (torch.randn(1, 3, 224, 224, dtype=dtype),) - - dynamic_shapes = ( - ( - { - 2: Dim("height", min=1, max=16) * 16, - 3: Dim("width", min=1, max=16) * 16, - }, - ) - if use_dynamic_shapes - else None + dynamic_shapes = ( + ( + { + 2: Dim("height", min=1, max=16) * 16, + 3: Dim("width", min=1, max=16) * 16, + }, ) + if use_dynamic_shapes + else None + ) + + test_runner.lower_and_run_model(model, inputs, dynamic_shapes=dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_alexnet(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.alexnet() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_convnext_small(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.convnext_small() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_densenet161(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.densenet161() + _test_cv_model(model, 
test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_efficientnet_b4(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.efficientnet_b4() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_efficientnet_v2_s(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.efficientnet_v2_s() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_googlenet(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.googlenet() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_inception_v3(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.inception_v3() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_STATIC_ONLY +def test_maxvit_t(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.maxvit_t() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_mnasnet1_0(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.mnasnet1_0() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_mobilenet_v2(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.mobilenet_v2() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_mobilenet_v3_small(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.mobilenet_v3_small() + _test_cv_model(model, test_runner, dtype, 
use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_regnet_y_1_6gf(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.regnet_y_1_6gf() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_resnet50(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.resnet50() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_resnext50_32x4d(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.resnext50_32x4d() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_shufflenet_v2_x1_0(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.shufflenet_v2_x1_0() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_squeezenet1_1(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.squeezenet1_1() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_swin_v2_t(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.swin_v2_t() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_vgg11(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.vgg11() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + + +@PARAMETERIZE_DTYPE +@PARAMETERIZE_STATIC_ONLY +def test_vit_b_16(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.vit_b_16() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) + - run_model_test(model, 
inputs, flow, dtype, dynamic_shapes) - - def test_alexnet( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.alexnet() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_convnext_small( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.convnext_small() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_densenet161( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.densenet161() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_efficientnet_b4( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.efficientnet_b4() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_efficientnet_v2_s( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.efficientnet_v2_s() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_googlenet( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.googlenet() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_inception_v3( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.inception_v3() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - @model_test_params(supports_dynamic_shapes=False) - def test_maxvit_t( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.maxvit_t() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_mnasnet1_0( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.mnasnet1_0() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_mobilenet_v2( - self, flow: TestFlow, dtype: 
torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.mobilenet_v2() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_mobilenet_v3_small( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.mobilenet_v3_small() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_regnet_y_1_6gf( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.regnet_y_1_6gf() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_resnet50( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.resnet50() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_resnext50_32x4d( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.resnext50_32x4d() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_shufflenet_v2_x1_0( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.shufflenet_v2_x1_0() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_squeezenet1_1( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.squeezenet1_1() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_swin_v2_t( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.swin_v2_t() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_vgg11(self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool): - model = torchvision.models.vgg11() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - @model_test_params(supports_dynamic_shapes=False) - def test_vit_b_16( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.vit_b_16() - 
self._test_cv_model(model, flow, dtype, use_dynamic_shapes) - - def test_wide_resnet50_2( - self, flow: TestFlow, dtype: torch.dtype, use_dynamic_shapes: bool - ): - model = torchvision.models.wide_resnet50_2() - self._test_cv_model(model, flow, dtype, use_dynamic_shapes) +@PARAMETERIZE_DTYPE +@PARAMETERIZE_DYNAMIC_SHAPES +def test_wide_resnet50_2(test_runner, dtype: torch.dtype, use_dynamic_shapes: bool): + model = torchvision.models.wide_resnet50_2() + _test_cv_model(model, test_runner, dtype, use_dynamic_shapes) diff --git a/backends/test/suite/operators/__init__.py b/backends/test/suite/operators/__init__.py index 9c550b3a49c..7475af29e15 100644 --- a/backends/test/suite/operators/__init__.py +++ b/backends/test/suite/operators/__init__.py @@ -6,19 +6,14 @@ # pyre-unsafe -import copy import os +import sys import unittest from enum import Enum -from typing import Callable +import pytest import torch -from executorch.backends.test.suite import get_test_flows -from executorch.backends.test.suite.context import get_active_test_context, TestContext -from executorch.backends.test.suite.flow import TestFlow -from executorch.backends.test.suite.reporting import log_test_summary -from executorch.backends.test.suite.runner import run_test def load_tests(loader, suite, pattern): @@ -66,112 +61,48 @@ def dtype_test(func): return func -# Class annotation for operator tests. This triggers the test framework to register -# the tests. -def operator_test(cls): - _create_tests(cls) - return cls - - -# Generate test cases for each backend flow. -def _create_tests(cls): - for key in dir(cls): - if key.startswith("test_"): - _expand_test(cls, key) - - -# Expand a test into variants for each registered flow. 
-def _expand_test(cls, test_name: str): - test_func = getattr(cls, test_name) - for flow in get_test_flows().values(): - _create_test_for_backend(cls, test_func, flow) - delattr(cls, test_name) +class OperatorTest(unittest.TestCase): + pass -def _make_wrapped_test( - test_func: Callable, - test_name: str, - test_base_name: str, - flow: TestFlow, - params: dict | None = None, -): - def wrapped_test(self): - with TestContext(test_name, test_base_name, flow.name, params): - if flow.should_skip_test(test_name): - raise unittest.SkipTest( - f"Skipping test due to matching flow {flow.name} skip patterns" - ) +class TestCaseShim: + def __init__(self, test_runner): + self._test_runner = test_runner - test_kwargs = copy.copy(params) or {} - test_kwargs["flow"] = flow + def _test_op(self, model, args, flow, generate_random_test_inputs=True): + self._test_runner.lower_and_run_model( + model, args, generate_random_test_inputs=generate_random_test_inputs + ) - test_func(self, **test_kwargs) - wrapped_test._name = test_name - wrapped_test._flow = flow +def wrap_test(original_func, test_type): + if test_type == TestType.STANDARD: - return wrapped_test + def wrapped_func(test_runner): + shim = TestCaseShim(test_runner) + original_func(shim, test_runner._flow) + return wrapped_func + elif test_type == TestType.DTYPE: -def _create_test_for_backend( - cls, - test_func: Callable, - flow: TestFlow, -): - test_type = getattr(test_func, "test_type", TestType.STANDARD) + @pytest.mark.parametrize("dtype", [torch.float32], ids=lambda s: str(s)[6:]) + def wrapped_func(test_runner, dtype): + shim = TestCaseShim(test_runner) + original_func(shim, test_runner._flow, dtype) - if test_type == TestType.STANDARD: - test_name = f"{test_func.__name__}_{flow.name}" - wrapped_test = _make_wrapped_test( - test_func, test_name, test_func.__name__, flow - ) - setattr(cls, test_name, wrapped_test) - elif test_type == TestType.DTYPE: - for dtype in DTYPES: - dtype_name = str(dtype)[6:] # strip "torch." 
- test_name = f"{test_func.__name__}_{dtype_name}_{flow.name}" - wrapped_test = _make_wrapped_test( - test_func, - test_name, - test_func.__name__, - flow, - {"dtype": dtype}, - ) - setattr(cls, test_name, wrapped_test) + return wrapped_func else: - raise NotImplementedError(f"Unknown test type {test_type}.") + raise ValueError() -class OperatorTest(unittest.TestCase): - def _test_op( - self, model, inputs, flow: TestFlow, generate_random_test_inputs: bool = True - ): - context = get_active_test_context() - - # This should be set in the wrapped test. See _make_wrapped_test above. - assert context is not None, "Missing test context." - - run_summary = run_test( - model, - inputs, - flow, - context.test_name, - context.test_base_name, - context.subtest_index, - context.params, - generate_random_test_inputs=generate_random_test_inputs, - ) - - log_test_summary(run_summary) +def operator_test(cls): + parent_module = sys.modules[cls.__module__] - # This is reset when a new test is started - it creates the context per-test. - context.subtest_index = context.subtest_index + 1 + for func_name in dir(cls): + if func_name.startswith("test"): + original_func = getattr(cls, func_name) + test_type = getattr(original_func, "test_type", TestType.STANDARD) + wrapped_func = wrap_test(original_func, test_type) + setattr(parent_module, func_name, wrapped_func) - if not run_summary.result.is_success(): - if run_summary.result.is_backend_failure(): - raise RuntimeError("Test failure.") from run_summary.error - else: - # Non-backend failure indicates a bad test. Mark as skipped. - raise unittest.SkipTest( - f"Test failed for reasons other than backend failure. 
Error: {run_summary.error}" - ) + return None diff --git a/backends/test/suite/operators/test_add.py b/backends/test/suite/operators/test_add.py index 6b21c3bf985..850e6f5132c 100644 --- a/backends/test/suite/operators/test_add.py +++ b/backends/test/suite/operators/test_add.py @@ -7,14 +7,8 @@ # pyre-unsafe +import pytest import torch -from executorch.backends.test.suite.flow import TestFlow - -from executorch.backends.test.suite.operators import ( - dtype_test, - operator_test, - OperatorTest, -) class Model(torch.nn.Module): @@ -31,55 +25,52 @@ def forward(self, x, y): return torch.add(x, y, alpha=self.alpha) -@operator_test -class Add(OperatorTest): - @dtype_test - def test_add_dtype(self, flow: TestFlow, dtype) -> None: - self._test_op( - Model(), - ( - (torch.rand(2, 10) * 100).to(dtype), - (torch.rand(2, 10) * 100).to(dtype), - ), - flow, - ) - - def test_add_f32_bcast_first(self, flow: TestFlow) -> None: - self._test_op( - Model(), - ( - torch.randn(5), - torch.randn(1, 5, 1, 5), - ), - flow, - ) - - def test_add_f32_bcast_second(self, flow: TestFlow) -> None: - self._test_op( - Model(), - ( - torch.randn(4, 4, 2, 7), - torch.randn(2, 7), - ), - flow, - ) - - def test_add_f32_bcast_unary(self, flow: TestFlow) -> None: - self._test_op( - Model(), - ( - torch.randn(5), - torch.randn(1, 1, 5), - ), - flow, - ) - - def test_add_f32_alpha(self, flow: TestFlow) -> None: - self._test_op( - ModelAlpha(alpha=2), - ( - torch.randn(1, 25), - torch.randn(1, 25), - ), - flow, - ) +@pytest.mark.parametrize("dtype", [torch.float32], ids=lambda s: str(s)[6:]) +def test_add_dtype(test_runner, dtype) -> None: + test_runner.lower_and_run_model( + Model(), + ( + (torch.rand(2, 10) * 100).to(dtype), + (torch.rand(2, 10) * 100).to(dtype), + ), + ) + + +def test_add_f32_bcast_first(test_runner) -> None: + test_runner.lower_and_run_model( + Model(), + ( + torch.randn(5), + torch.randn(1, 5, 1, 5), + ), + ) + + +def test_add_f32_bcast_second(test_runner) -> None: + 
test_runner.lower_and_run_model( + Model(), + ( + torch.randn(4, 4, 2, 7), + torch.randn(2, 7), + ), + ) + + +def test_add_f32_bcast_unary(test_runner) -> None: + test_runner.lower_and_run_model( + Model(), + ( + torch.randn(5), + torch.randn(1, 1, 5), + ), + ) + + +def test_add_f32_alpha(test_runner) -> None: + test_runner.lower_and_run_model( + ModelAlpha(alpha=2), + ( + torch.randn(1, 25), + torch.randn(1, 25), + ), + ) diff --git a/backends/test/suite/operators/test_lstm.py b/backends/test/suite/operators/test_lstm.py index 91dd73c9052..11632e1e055 100644 --- a/backends/test/suite/operators/test_lstm.py +++ b/backends/test/suite/operators/test_lstm.py @@ -1,5 +1,6 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. +# Copyright 2025 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -15,6 +16,11 @@ operator_test, OperatorTest, ) +from torch.nn.quantizable.modules.rnn import LSTM as QuantizableLSTM + + +def _get_lstm_cls(use_quantizable_lstm: bool): + return QuantizableLSTM if use_quantizable_lstm else torch.nn.LSTM class Model(torch.nn.Module): @@ -27,9 +33,11 @@ def __init__( batch_first=True, dropout=0.0, bidirectional=False, + use_quantizable_lstm: bool = False, ): super().__init__() - self.lstm = torch.nn.LSTM( + lstm_cls = _get_lstm_cls(use_quantizable_lstm) + self.lstm = lstm_cls( input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, @@ -47,106 +55,133 @@ def forward(self, x): class LSTM(OperatorTest): @dtype_test def test_lstm_dtype(self, flow: TestFlow, dtype) -> None: + use_quantizable_lstm = flow.quantize self._test_op( - Model(num_layers=2).to(dtype), + Model(num_layers=2, use_quantizable_lstm=use_quantizable_lstm).to(dtype), ((torch.rand(1, 10, 64) * 10).to(dtype),), # (batch=1, seq_len, input_size) flow, ) @dtype_test def test_lstm_no_bias_dtype(self, flow: TestFlow, dtype) -> None: + 
use_quantizable_lstm = flow.quantize self._test_op( - Model(num_layers=2, bias=False).to(dtype), + Model( + num_layers=2, bias=False, use_quantizable_lstm=use_quantizable_lstm + ).to(dtype), ((torch.rand(1, 10, 64) * 10).to(dtype),), flow, ) def test_lstm_feature_sizes(self, flow: TestFlow) -> None: + use_quantizable_lstm = flow.quantize self._test_op( - Model(input_size=32, hidden_size=16), + Model( + input_size=32, + hidden_size=16, + use_quantizable_lstm=use_quantizable_lstm, + ), (torch.randn(1, 8, 32),), # (batch=1, seq_len, input_size) flow, ) self._test_op( - Model(input_size=128, hidden_size=64), + Model( + input_size=128, + hidden_size=64, + use_quantizable_lstm=use_quantizable_lstm, + ), (torch.randn(1, 12, 128),), flow, ) self._test_op( - Model(input_size=256, hidden_size=128), + Model( + input_size=256, + hidden_size=128, + use_quantizable_lstm=use_quantizable_lstm, + ), (torch.randn(1, 6, 256),), flow, ) self._test_op( - Model(input_size=16, hidden_size=32), + Model( + input_size=16, + hidden_size=32, + use_quantizable_lstm=use_quantizable_lstm, + ), (torch.randn(1, 5, 16),), flow, ) def test_lstm_batch_sizes(self, flow: TestFlow) -> None: + use_quantizable_lstm = flow.quantize self._test_op( - Model(), + Model(use_quantizable_lstm=use_quantizable_lstm), (torch.randn(8, 10, 64),), flow, ) self._test_op( - Model(), + Model(use_quantizable_lstm=use_quantizable_lstm), (torch.randn(32, 10, 64),), flow, ) self._test_op( - Model(), + Model(use_quantizable_lstm=use_quantizable_lstm), (torch.randn(100, 10, 64),), flow, ) def test_lstm_seq_lengths(self, flow: TestFlow) -> None: + use_quantizable_lstm = flow.quantize self._test_op( - Model(), + Model(use_quantizable_lstm=use_quantizable_lstm), (torch.randn(1, 5, 64),), flow, ) self._test_op( - Model(), + Model(use_quantizable_lstm=use_quantizable_lstm), (torch.randn(1, 20, 64),), flow, ) self._test_op( - Model(), + Model(use_quantizable_lstm=use_quantizable_lstm), (torch.randn(1, 50, 64),), flow, ) def 
test_lstm_batch_first_false(self, flow: TestFlow) -> None: + use_quantizable_lstm = flow.quantize self._test_op( - Model(batch_first=False), + Model(batch_first=False, use_quantizable_lstm=use_quantizable_lstm), (torch.randn(10, 1, 64),), # (seq_len, batch=1, input_size) flow, ) def test_lstm_num_layers(self, flow: TestFlow) -> None: + use_quantizable_lstm = flow.quantize self._test_op( - Model(num_layers=2), + Model(num_layers=2, use_quantizable_lstm=use_quantizable_lstm), (torch.randn(1, 10, 64),), flow, ) self._test_op( - Model(num_layers=3), + Model(num_layers=3, use_quantizable_lstm=use_quantizable_lstm), (torch.randn(1, 10, 64),), flow, ) def test_lstm_bidirectional(self, flow: TestFlow) -> None: + use_quantizable_lstm = flow.quantize self._test_op( - Model(bidirectional=True), + Model(bidirectional=True, use_quantizable_lstm=use_quantizable_lstm), (torch.randn(1, 10, 64),), flow, ) def test_lstm_with_dropout(self, flow: TestFlow) -> None: # Note: Dropout is only effective with num_layers > 1 + use_quantizable_lstm = flow.quantize self._test_op( - Model(num_layers=2, dropout=0.2), + Model(num_layers=2, dropout=0.2, use_quantizable_lstm=use_quantizable_lstm), (torch.randn(1, 10, 64),), flow, ) @@ -154,9 +189,10 @@ def test_lstm_with_dropout(self, flow: TestFlow) -> None: def test_lstm_with_initial_states(self, flow: TestFlow) -> None: # Create a model that accepts initial states class ModelWithStates(torch.nn.Module): - def __init__(self): + def __init__(self, use_quantizable_lstm: bool = False): super().__init__() - self.lstm = torch.nn.LSTM( + lstm_cls = _get_lstm_cls(use_quantizable_lstm) + self.lstm = lstm_cls( input_size=64, hidden_size=32, num_layers=2, @@ -169,9 +205,10 @@ def forward(self, x, h0, c0): batch_size = 1 num_layers = 2 hidden_size = 32 + use_quantizable_lstm = flow.quantize self._test_op( - ModelWithStates(), + ModelWithStates(use_quantizable_lstm=use_quantizable_lstm), ( torch.randn(batch_size, 10, 64), # input torch.randn(num_layers, 
batch_size, hidden_size), # h0 @@ -183,9 +220,10 @@ def forward(self, x, h0, c0): def test_lstm_return_hidden_states(self, flow: TestFlow) -> None: # Create a model that returns both output and hidden states class ModelWithHiddenStates(torch.nn.Module): - def __init__(self): + def __init__(self, use_quantizable_lstm: bool = False): super().__init__() - self.lstm = torch.nn.LSTM( + lstm_cls = _get_lstm_cls(use_quantizable_lstm) + self.lstm = lstm_cls( input_size=64, hidden_size=32, num_layers=2, @@ -200,9 +238,10 @@ def forward(self, x): batch_size = 1 seq_len = 10 input_size = 64 + use_quantizable_lstm = flow.quantize self._test_op( - ModelWithHiddenStates(), + ModelWithHiddenStates(use_quantizable_lstm=use_quantizable_lstm), (torch.randn(batch_size, seq_len, input_size),), flow, ) diff --git a/backends/test/suite/operators/test_rsqrt.py b/backends/test/suite/operators/test_rsqrt.py index 705833194fb..bb51b213dd4 100644 --- a/backends/test/suite/operators/test_rsqrt.py +++ b/backends/test/suite/operators/test_rsqrt.py @@ -37,15 +37,28 @@ def test_rsqrt_dtype(self, flow: TestFlow, dtype) -> None: def test_rsqrt_shapes(self, flow: TestFlow) -> None: # Test with different tensor shapes - # 1D tensor - self._test_op(RsqrtModel(), (torch.rand(20) + 0.01,), flow) - + self._test_op( + RsqrtModel(), + (torch.rand(20) + 0.01,), + flow, + generate_random_test_inputs=False, + ) # 2D tensor - self._test_op(RsqrtModel(), (torch.rand(5, 10) + 0.01,), flow) + self._test_op( + RsqrtModel(), + (torch.rand(5, 10) + 0.01,), + flow, + generate_random_test_inputs=False, + ) # 3D tensor - self._test_op(RsqrtModel(), (torch.rand(3, 4, 5) + 0.01,), flow) + self._test_op( + RsqrtModel(), + (torch.rand(3, 4, 5) + 0.01,), + flow, + generate_random_test_inputs=False, + ) @unittest.skip("NaN and Inf are not enforced for backends.") def test_rsqrt_edge_cases(self, flow: TestFlow) -> None: diff --git a/backends/test/suite/operators/test_sqrt.py b/backends/test/suite/operators/test_sqrt.py index 
3d327ade6a5..92fbc64878e 100644 --- a/backends/test/suite/operators/test_sqrt.py +++ b/backends/test/suite/operators/test_sqrt.py @@ -39,13 +39,19 @@ def test_sqrt_shapes(self, flow: TestFlow) -> None: # Test with different tensor shapes # 1D tensor - self._test_op(SqrtModel(), (torch.rand(20),), flow) + self._test_op( + SqrtModel(), (torch.rand(20),), flow, generate_random_test_inputs=False + ) # 2D tensor - self._test_op(SqrtModel(), (torch.rand(5, 10),), flow) + self._test_op( + SqrtModel(), (torch.rand(5, 10),), flow, generate_random_test_inputs=False + ) # 3D tensor - self._test_op(SqrtModel(), (torch.rand(3, 4, 5),), flow) + self._test_op( + SqrtModel(), (torch.rand(3, 4, 5),), flow, generate_random_test_inputs=False + ) @unittest.skip("NaN and Inf are not enforced for backends.") def test_sqrt_edge_cases(self, flow: TestFlow) -> None: diff --git a/backends/test/suite/operators/test_sub.py b/backends/test/suite/operators/test_sub.py index be7b871fdad..2243eb6ee71 100644 --- a/backends/test/suite/operators/test_sub.py +++ b/backends/test/suite/operators/test_sub.py @@ -6,7 +6,6 @@ # pyre-unsafe - import torch from executorch.backends.test.suite.flow import TestFlow diff --git a/backends/test/suite/reporting.py b/backends/test/suite/reporting.py index cdf2ce870e1..09e950ab672 100644 --- a/backends/test/suite/reporting.py +++ b/backends/test/suite/reporting.py @@ -1,4 +1,5 @@ import csv +import json from collections import Counter from dataclasses import dataclass, field @@ -343,7 +344,9 @@ def _sum_op_counts(counter: Counter | None) -> int | None: def _serialize_params(params: dict[str, Any] | None) -> str: if params is not None: - return str(dict(sorted(params.items()))) + # Convert values to strings - JSON conversion doesn't like dtypes. 
+ str_params = {k: str(v) for k, v in params.items()} + return json.dumps(str_params) else: return "" diff --git a/backends/test/suite/runner.py b/backends/test/suite/runner.py index eeea09e0fc1..a6d7d07bce0 100644 --- a/backends/test/suite/runner.py +++ b/backends/test/suite/runner.py @@ -57,7 +57,7 @@ def _graph_has_unsupported_patterns(program: torch.export.ExportedProgram) -> bo and node.target == exir_ops.edge.aten.convolution.default ): in_rank = node.args[0].meta["val"].dim() - if in_rank != 4: + if in_rank > 4: return True return False diff --git a/backends/test/suite/tests/test_reporting.py b/backends/test/suite/tests/test_reporting.py index 58ff76cba17..e42681fc678 100644 --- a/backends/test/suite/tests/test_reporting.py +++ b/backends/test/suite/tests/test_reporting.py @@ -1,3 +1,4 @@ +import json import unittest from csv import DictReader @@ -102,14 +103,16 @@ def test_csv_report_simple(self): self.assertEqual(records[2]["Test Case"], "test2") self.assertEqual(records[2]["Flow"], "flow1") self.assertEqual(records[2]["Result"], "Pass") - self.assertEqual(records[2]["Params"], str({"dtype": torch.float32})) + self.assertEqual(records[2]["Params"], json.dumps({"dtype": "torch.float32"})) # Validate fourth record: test2, backend2, EXPORT_FAIL with use_dynamic_shapes param self.assertEqual(records[3]["Test ID"], "test2_backend2_flow1") self.assertEqual(records[3]["Test Case"], "test2") self.assertEqual(records[3]["Flow"], "flow1") self.assertEqual(records[3]["Result"], "Skip") - self.assertEqual(records[3]["Params"], str({"use_dynamic_shapes": True})) + self.assertEqual( + records[3]["Params"], json.dumps({"use_dynamic_shapes": "True"}) + ) def test_count_ops(self): """ diff --git a/backends/transforms/decompose_sdpa.py b/backends/transforms/decompose_sdpa.py index d49e0da0c9b..6c36d1803fc 100644 --- a/backends/transforms/decompose_sdpa.py +++ b/backends/transforms/decompose_sdpa.py @@ -7,6 +7,7 @@ # pyre-strict import math +from typing import Set, Type 
import torch from executorch.exir.pass_base import ExportPass, PassResult @@ -19,6 +20,8 @@ class DecomposeScaledDotProductAttention(ExportPass): Decompose from scaled_dot_product_attention to multiple nodes. """ + _passes_required_after: Set[Type[ExportPass]] = set() + def __init__(self, allow_non_fake_inputs: bool = True) -> None: super().__init__() # With allow_non_fake_inputs=False, we don't get _unsafe_view ops diff --git a/backends/transforms/fuse_view_copy.py b/backends/transforms/fuse_view_copy.py index c740515cdcc..1972513d2ef 100644 --- a/backends/transforms/fuse_view_copy.py +++ b/backends/transforms/fuse_view_copy.py @@ -7,6 +7,8 @@ # pyre-strict +from typing import Set, Type + import torch from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult @@ -62,6 +64,8 @@ def remove_noop_view_copy(graph: torch.fx.Graph) -> tuple[torch.fx.Graph, bool]: class FuseViewCopyTransform(ExportPass): + _passes_required_after: Set[Type[ExportPass]] = set() + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: graph_module.graph, merge_modified = merge_view_copy_chains(graph_module.graph) graph_module.graph, noop_modified = remove_noop_view_copy(graph_module.graph) diff --git a/backends/vulkan/CMakeLists.txt b/backends/vulkan/CMakeLists.txt index 29ff90e7293..d9acde79ecf 100644 --- a/backends/vulkan/CMakeLists.txt +++ b/backends/vulkan/CMakeLists.txt @@ -105,17 +105,33 @@ target_include_directories( $ ) +# vulkan runtime utils files + +file(GLOB_RECURSE vulkan_runtime_utils_cpp ${RUNTIME_PATH}/utils/*.cpp) + # vulkan_backend +# Try to find boost to log stack traces when throwing exceptions +find_package(Boost 1.89 COMPONENTS stacktrace_basic stacktrace_addr2line) + file(GLOB vulkan_backend_cpp ${RUNTIME_PATH}/*.cpp) list(APPEND vulkan_backend_cpp ${vulkan_graph_cpp}) list(APPEND vulkan_backend_cpp ${vulkan_standard_shaders_cpp}) +list(APPEND vulkan_backend_cpp ${vulkan_runtime_utils_cpp}) 
add_library(vulkan_backend ${vulkan_backend_cpp}) target_include_directories( vulkan_backend PRIVATE ${SCHEMA_INCLUDE_DIR} ${COMMON_INCLUDES} ) target_link_libraries(vulkan_backend PRIVATE vulkan_schema executorch_core) +# Optionally link boost for stacktraces if boost is available +if(DEFINED Boost_STACKTRACE_BASIC_LIBRARY) + target_link_libraries( + vulkan_backend PRIVATE ${Boost_STACKTRACE_LIBRARY} + ${Boost_STACKTRACE_ADDR2LINE_LIBRARY} + ) + list(APPEND VULKAN_CXX_FLAGS "-DETVK_BOOST_STACKTRACE_AVAILABLE") +endif() target_compile_options(vulkan_backend PRIVATE ${VULKAN_CXX_FLAGS}) # Link this library with --whole-archive due to dynamic backend registration executorch_target_link_options_shared_lib(vulkan_backend) @@ -127,7 +143,7 @@ set_property(TARGET vulkan_backend PROPERTY CXX_STANDARD 17) install( TARGETS vulkan_backend vulkan_schema EXPORT ExecuTorchTargets - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} INCLUDES DESTINATION ${COMMON_INCLUDES} ) diff --git a/backends/vulkan/README.md b/backends/vulkan/README.md index e0a953d05fe..63a9b0b049a 100644 --- a/backends/vulkan/README.md +++ b/backends/vulkan/README.md @@ -150,7 +150,7 @@ when building with CMake. First, make sure that you have the Android NDK installed; any NDK version past NDK r19c should work. Note that the examples in this doc have been validated with -NDK r27b. The Android SDK should also be installed so that you have access to `adb`. +NDK r28c. The Android SDK should also be installed so that you have access to `adb`. The instructions in this page assumes that the following environment variables are set. 
diff --git a/backends/vulkan/_passes/TARGETS b/backends/vulkan/_passes/TARGETS index aed41114ada..ae1a0b79654 100644 --- a/backends/vulkan/_passes/TARGETS +++ b/backends/vulkan/_passes/TARGETS @@ -117,6 +117,19 @@ runtime.python_library( ], ) +runtime.python_library( + name = "replace_qdq", + srcs = ["replace_qdq.py"], + visibility = [ + "//executorch/backends/...", + ], + deps = [ + "//caffe2:torch", + "//executorch/backends/vulkan:utils_lib", + "//executorch/exir:pass_base", + ], +) + runtime.python_library( name = "fuse_patterns", srcs = ["fuse_patterns.py"], @@ -150,6 +163,7 @@ runtime.python_library( ":remove_asserts", ":remove_local_scalar_dense", ":remove_redundant_ops", + ":replace_qdq", ":squeeze_unsqueeze_inputs", ":tag_memory_meta_pass", ] diff --git a/backends/vulkan/_passes/__init__.py b/backends/vulkan/_passes/__init__.py index f4ef6b2ac0e..169bd60543c 100644 --- a/backends/vulkan/_passes/__init__.py +++ b/backends/vulkan/_passes/__init__.py @@ -22,6 +22,7 @@ from executorch.backends.vulkan._passes.remove_redundant_ops import ( RemoveRedundantOpsTransform, ) +from executorch.backends.vulkan._passes.replace_qdq import ReplaceQDQPass from executorch.backends.vulkan._passes.squeeze_unsqueeze_inputs import ( SqueezeUnsqueezeInputs, ) @@ -36,6 +37,7 @@ "RemoveAssertsTransform", "RemoveLocalScalarDenseOpsTransform", "RemoveRedundantOpsTransform", + "ReplaceQDQPass", "SqueezeUnsqueezeInputs", "TagMemoryMetaPass", ] diff --git a/backends/vulkan/_passes/fold_qdq.py b/backends/vulkan/_passes/fold_qdq.py index 3beccc2205c..a6a5e751c05 100644 --- a/backends/vulkan/_passes/fold_qdq.py +++ b/backends/vulkan/_passes/fold_qdq.py @@ -17,9 +17,8 @@ class FoldQDQPass(ExportPass): valid quant op patterns have already been fused before this pass. 
""" - def __init__(self, edge_program: torch.export.ExportedProgram): - super(FoldQDQPass, self).__init__() - self.edge_program = edge_program + def __init__(self): + super().__init__() def call(self, graph_module: torch.fx.GraphModule): for node in graph_module.graph.nodes: diff --git a/backends/vulkan/_passes/fuse_patterns.py b/backends/vulkan/_passes/fuse_patterns.py index 6ced1f32a7c..1575dd6a4f6 100644 --- a/backends/vulkan/_passes/fuse_patterns.py +++ b/backends/vulkan/_passes/fuse_patterns.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +from typing import Optional + import executorch.backends.vulkan.patterns as vk_patterns import torch @@ -13,13 +15,15 @@ class FusePatternsPass(ExportPass): - def __init__(self, exported_program: ExportedProgram) -> None: + def __init__(self) -> None: super().__init__() - self.program = exported_program + self._exported_program: Optional[ExportedProgram] = None def call(self, graph_module: torch.fx.GraphModule): + assert self._exported_program is not None + total_replaced = vk_patterns.replace_all_fusable_subgraphs( - self.program, graph_module + self._exported_program, graph_module ) if total_replaced > 0: diff --git a/backends/vulkan/_passes/fuse_quantized_ops.py b/backends/vulkan/_passes/fuse_quantized_ops.py index ca9f7541159..bb8cf5f2e64 100644 --- a/backends/vulkan/_passes/fuse_quantized_ops.py +++ b/backends/vulkan/_passes/fuse_quantized_ops.py @@ -211,18 +211,20 @@ def fuse_into_linear_qcnw_node( class FuseQuantizedOpsTransform(ExportPass): - def __init__(self, exported_program: ExportedProgram) -> None: + def __init__(self) -> None: super().__init__() - self.program = exported_program + self._exported_program: Optional[ExportedProgram] = None def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + assert self._exported_program is not None + for node in graph_module.graph.nodes: # Check for linear_qcnw 
pattern (weight-only quantization) - qcnw_details = matches_linear_qcnw_pattern(self.program, node) + qcnw_details = matches_linear_qcnw_pattern(self._exported_program, node) if qcnw_details is not None: qcnw_method, qcnw_nbits = qcnw_details fuse_into_linear_qcnw_node( - self.program, graph_module, node, qcnw_method, qcnw_nbits + self._exported_program, graph_module, node, qcnw_method, qcnw_nbits ) continue diff --git a/backends/vulkan/_passes/replace_qdq.py b/backends/vulkan/_passes/replace_qdq.py new file mode 100644 index 00000000000..fcfcdfc4c18 --- /dev/null +++ b/backends/vulkan/_passes/replace_qdq.py @@ -0,0 +1,94 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import executorch.backends.vulkan.utils as utils +import torch +from executorch.exir.dialects._ops import ops as exir_ops + +from executorch.exir.pass_base import ExportPass, PassResult + + +class ReplaceQDQPass(ExportPass): + """ + Replace standard quantize/dequantize ops with custom conv-specific ops when they + feed into/from quantized convolution operations. This optimization allows the + backend to handle quantization more efficiently for convolution operations. 
+ """ + + def __init__(self): + super(ReplaceQDQPass, self).__init__() + + def call(self, graph_module: torch.fx.GraphModule): + # Track nodes that need to be replaced + nodes_to_replace = [] + + for node in graph_module.graph.nodes: + # Check if this is the custom quantized conv2d op + if node.target in [ + exir_ops.edge.et_vk.conv2d_q8ta_q8csw_q8to.default, + exir_ops.edge.et_vk.conv2d_q8ta_q8csw_q8to_dw.default, + exir_ops.edge.et_vk.add_q8ta_q8ta_q8to.default, + ]: + # Replace quantize op feeding into conv2d (first argument is the quantized input) + quantized_input_node = node.args[0] + if isinstance( + quantized_input_node, torch.fx.Node + ) and utils.is_quant_node(quantized_input_node): + # Get the arguments from the original quantize node + input_tensor = quantized_input_node.args[0] + scale = quantized_input_node.args[1] + zero_point = quantized_input_node.args[2] + + nodes_to_replace.append( + { + "old_node": quantized_input_node, + "new_target": exir_ops.edge.et_vk.quantize_q8ta_for_conv2d.default, + "args": (input_tensor, scale, zero_point), + "node_type": "quantize_input", + } + ) + + # Find dequantize ops that consume the output of this conv2d + for user in node.users: + if utils.is_dequant_node(user): + # Get the arguments from the original dequantize node + scale = user.args[1] + zero_point = user.args[2] + + nodes_to_replace.append( + { + "old_node": user, + "new_target": exir_ops.edge.et_vk.dequantize_q8to_from_conv2d.default, + "args": ( + node, + scale, + zero_point, + ), # node is the conv2d output + "node_type": "dequantize_output", + } + ) + + # Apply the replacements + for replacement in nodes_to_replace: + old_node = replacement["old_node"] + new_target = replacement["new_target"] + new_args = replacement["args"] + + with graph_module.graph.inserting_before(old_node): + new_node = graph_module.graph.create_node( + "call_function", new_target, args=new_args + ) + new_node.meta = old_node.meta.copy() + old_node.replace_all_uses_with(new_node) 
+ + # Clean up the graph + graph_module.graph.eliminate_dead_code() + graph_module.recompile() + + # Re-trace to validate everything is ok + graph_module = super().call(graph_module).graph_module + + return PassResult(graph_module, True) diff --git a/backends/vulkan/_passes/tag_memory_meta_pass.py b/backends/vulkan/_passes/tag_memory_meta_pass.py index db53cc666a8..8ed71aa1dae 100644 --- a/backends/vulkan/_passes/tag_memory_meta_pass.py +++ b/backends/vulkan/_passes/tag_memory_meta_pass.py @@ -230,6 +230,10 @@ def get_arg_tensor_source_repset( """ arg_node = op_node.args[arg_i] + # For non-tensor arguments, return ANY_STORAGE + if not utils.is_tensor_arg_node(arg_node): + return utils.ANY_STORAGE + # Special case for cat - use the first tensor in the list as representative if isinstance(arg_node, list): arg_node = arg_node[0] diff --git a/backends/vulkan/cmake/ShaderLibrary.cmake b/backends/vulkan/cmake/ShaderLibrary.cmake index 1b6838c4dfd..16a60abf6f3 100644 --- a/backends/vulkan/cmake/ShaderLibrary.cmake +++ b/backends/vulkan/cmake/ShaderLibrary.cmake @@ -24,22 +24,17 @@ if(NOT EXECUTORCH_ROOT) message("WARNING: EXECUTORCH_ROOT is not set! A failure is likely imminent.") endif() -if(ANDROID) - if(NOT ANDROID_NDK) - message(FATAL_ERROR "ANDROID_NDK not set") - endif() - - if(NOT GLSLC_PATH) - set(GLSLC_PATH - "${ANDROID_NDK}/shader-tools/${ANDROID_NDK_HOST_SYSTEM_NAME}/glslc" - ) - endif() -else() - find_program(GLSLC_PATH glslc PATHS $ENV{PATH}) +find_program(GLSLC_PATH glslc PATHS $ENV{PATH}) - if(NOT GLSLC_PATH) - message(FATAL_ERROR "USE_VULKAN glslc not found") - endif() +if(NOT GLSLC_PATH) + message( + FATAL_ERROR + "glslc from the Vulkan SDK must be installed to build the Vulkan backend. " + "Please install the Vulkan SDK 1.4.321.0 or newer from " + "https://vulkan.lunarg.com/sdk/home and ensure that the glslc binary is in your PATH. 
" + "Note that the glslc distributed with the Android NDK is not compatible since it " + "does not support the GL_EXT_integer_dot_product extension. " + ) endif() # Required to enable linking with --whole-archive diff --git a/backends/vulkan/custom_ops_lib.py b/backends/vulkan/custom_ops_lib.py index 56e803b9127..6e5aa926d37 100644 --- a/backends/vulkan/custom_ops_lib.py +++ b/backends/vulkan/custom_ops_lib.py @@ -354,18 +354,20 @@ def linear_q8ta_q8csw( lib.impl(name, linear_q8ta_q8csw, "CompositeExplicitAutograd") qa_q8csw_linear = getattr(getattr(torch.ops, namespace), name) -####################### -## conv2d_q8ta_q8csw ## -####################### +############################ +## conv2d_q8ta_q8csw_q8to ## +############################ -def conv2d_q8ta_q8csw( +def conv2d_q8ta_q8csw_q8to( x: torch.Tensor, input_scale: float, input_zero_point: int, weights: torch.Tensor, weight_sums: torch.Tensor, weight_scales: torch.Tensor, + output_scale: float, + output_zero_point: int, bias: Optional[torch.Tensor], kernel_size: list, stride: list, @@ -373,27 +375,103 @@ def conv2d_q8ta_q8csw( dilation: list, groups: int, ): - IC = x.shape[1] + x = torch.ops.quantized_decomposed.dequantize_per_tensor( + x, input_scale, input_zero_point, -128, 127, x.dtype + ) + + # Calculate weight dimensions + OC = weights.shape[0] + assert OC % groups == 0, "Output channels must be divisible by groups" + IC_per_group = int(x.shape[1] / groups) K_h, K_w = kernel_size[0], kernel_size[1] - canonical_weight_K_dim = K_h * K_w * IC + orig_weight_K_dim = K_h * K_w * IC_per_group + # Remove any padding added to in_features dim to align to a multiple of 4 + if weights.shape[-1] > orig_weight_K_dim: + weights = weights[:, :orig_weight_K_dim] + # Remove any padding added to output channels dim to align to a multiple of 4 - if weights.shape[-1] != canonical_weight_K_dim: - weights = weights[:, :canonical_weight_K_dim] - weight_scales = weight_scales[:canonical_weight_K_dim] + if weight_scales.shape[0] 
> OC: + weight_scales = weight_scales[:OC] if bias is not None: - bias = bias[:canonical_weight_K_dim] + bias = bias[:OC] + + # Reshape to original 4D format (OC, IC, H, W) + weights = weights.view(OC, IC_per_group, K_h, K_w) weight_zeros = torch.zeros_like(weight_scales, dtype=torch.int32) + # Dequantize weights + weights = torch.ops.quantized_decomposed.dequantize_per_channel( + weights, + weight_scales, + weight_zeros, + 0, # axis=0 for output channel quantization + -127, + 127, + torch.int8, + ) - # Calculate dimensions - OC = weights.shape[0] - in_features = weights.shape[1] - IC = in_features // (K_h * K_w) + # Perform convolution + out = torch.nn.functional.conv2d( + x, weights, bias, stride, padding, dilation, groups + ) - # Reshape to original 4D format (OC, IC, H, W) - weights = weights.view(OC, IC, K_h, K_w) + out = torch.ops.quantized_decomposed.quantize_per_tensor( + out, output_scale, output_zero_point, -128, 127, torch.int8 + ) + return out + + +name = "conv2d_q8ta_q8csw_q8to" +lib.define( + f""" + {name}( + Tensor x, + float input_scale, + int input_zero_point, + Tensor weights, + Tensor weight_sums, + Tensor weight_scales, + float output_scale, + int output_zero_point, + Tensor? 
bias, + SymInt[] kernel_size, + SymInt[] stride, + SymInt[] padding, + SymInt[] dilation, + SymInt groups) -> Tensor + """ +) +lib.impl(name, conv2d_q8ta_q8csw_q8to, "CompositeExplicitAutograd") +conv2d_q8ta_q8csw_op = getattr(getattr(torch.ops, namespace), name) + + +def conv2d_q8ta_q8csw_q8to_dw( + x: torch.Tensor, + input_scale: float, + input_zero_point: int, + weights: torch.Tensor, + weight_sums: torch.Tensor, + weight_scales: torch.Tensor, + output_scale: float, + output_zero_point: int, + bias: Optional[torch.Tensor], + kernel_size: list, + stride: list, + padding: list, + dilation: list, + groups: int, +): + x = torch.ops.quantized_decomposed.dequantize_per_tensor( + x, input_scale, input_zero_point, -128, 127, x.dtype + ) + + # Restore weight to original data layout + K_h, K_w, OC = weights.shape + weights = weights.permute(2, 0, 1).reshape(OC, 1, K_h, K_w) + + weight_zeros = torch.zeros_like(weight_scales, dtype=torch.int32) # Dequantize weights weights = torch.ops.quantized_decomposed.dequantize_per_channel( weights, @@ -410,10 +488,14 @@ def conv2d_q8ta_q8csw( x, weights, bias, stride, padding, dilation, groups ) + out = torch.ops.quantized_decomposed.quantize_per_tensor( + out, output_scale, output_zero_point, -128, 127, torch.int8 + ) + return out -name = "conv2d_q8ta_q8csw" +name = "conv2d_q8ta_q8csw_q8to_dw" lib.define( f""" {name}( @@ -423,6 +505,8 @@ def conv2d_q8ta_q8csw( Tensor weights, Tensor weight_sums, Tensor weight_scales, + float output_scale, + int output_zero_point, Tensor? 
bias, SymInt[] kernel_size, SymInt[] stride, @@ -431,8 +515,8 @@ def conv2d_q8ta_q8csw( SymInt groups) -> Tensor """ ) -lib.impl(name, conv2d_q8ta_q8csw, "CompositeExplicitAutograd") -conv2d_q8ta_q8csw_op = getattr(getattr(torch.ops, namespace), name) +lib.impl(name, conv2d_q8ta_q8csw_q8to_dw, "CompositeExplicitAutograd") +conv2d_q8ta_q8csw_dw_op = getattr(getattr(torch.ops, namespace), name) ###################### ## apply_rotary_emb ## @@ -452,3 +536,81 @@ def apply_rotary_emb_impl( ) lib.impl(name, apply_rotary_emb_impl, "CompositeExplicitAutograd") apply_rotary_emb_op = getattr(getattr(torch.ops, namespace), name) + +############################# +## quantize/dequantize ops ## +############################# + + +def quantize_q8ta_for_conv2d_impl( + input: torch.Tensor, + scale: float, + zero_point: int, +): + return torch.ops.quantized_decomposed.quantize_per_tensor( + input, scale, zero_point, -128, 127, torch.int8 + ) + + +name = "quantize_q8ta_for_conv2d" +lib.define(f"{name}(Tensor input, float scale, int zero_point) -> Tensor") +lib.impl(name, quantize_q8ta_for_conv2d_impl, "CompositeExplicitAutograd") +quantize_q8ta_for_conv2d_op = getattr(getattr(torch.ops, namespace), name) + + +def dequantize_q8to_from_conv2d_impl( + input: torch.Tensor, + scale: float, + zero_point: int, +): + return torch.ops.quantized_decomposed.dequantize_per_tensor( + input, scale, zero_point, -128, 127, input.dtype + ) + + +name = "dequantize_q8to_from_conv2d" +lib.define(f"{name}(Tensor input, float scale, int zero_point) -> Tensor") +lib.impl(name, dequantize_q8to_from_conv2d_impl, "CompositeExplicitAutograd") +dequantize_q8to_from_conv2d_op = getattr(getattr(torch.ops, namespace), name) + +######################## +## add_q8ta_q8ta_q8to ## +######################## + + +def add_q8ta_q8ta_q8to_impl( + input_a: torch.Tensor, + input_b: torch.Tensor, + input_a_scale: float, + input_a_zero_point: int, + input_b_scale: float, + input_b_zero_point: int, + output_scale: float, + 
output_zero_point: int, + alpha: float, +): + # Dequantize inputs to float + dequant_a = torch.ops.quantized_decomposed.dequantize_per_tensor( + input_a, input_a_scale, input_a_zero_point, -128, 127, input_a.dtype + ) + dequant_b = torch.ops.quantized_decomposed.dequantize_per_tensor( + input_b, input_b_scale, input_b_zero_point, -128, 127, input_b.dtype + ) + + # Perform addition with alpha scaling + result = dequant_a + alpha * dequant_b + + # Quantize the result back to int8 + quantized_result = torch.ops.quantized_decomposed.quantize_per_tensor( + result, output_scale, output_zero_point, -128, 127, torch.int8 + ) + + return quantized_result + + +name = "add_q8ta_q8ta_q8to" +lib.define( + f"{name}(Tensor input_a, Tensor input_b, float input_a_scale, int input_a_zero_point, float input_b_scale, int input_b_zero_point, float output_scale, int output_zero_point, float alpha) -> Tensor" +) +lib.impl(name, add_q8ta_q8ta_q8to_impl, "CompositeExplicitAutograd") +add_q8ta_q8ta_q8to_op = getattr(getattr(torch.ops, namespace), name) diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py index 4c686e0cfc5..63b57a0e79c 100644 --- a/backends/vulkan/op_registry.py +++ b/backends/vulkan/op_registry.py @@ -16,8 +16,6 @@ import torch -from executorch.backends.vulkan.serialization.vulkan_graph_schema import VkMemoryLayout - from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.dialects.edge._ops import EdgeOpOverload @@ -48,6 +46,9 @@ class OpFeatures: # Optional check function used during partitioning to determine if a node's # inputs are supported by the operator implementation. "are_node_inputs_supported_fn", + # Optional function to determine valid representation sets for input and outputs + # once a node's actual inputs are known. 
+ "pick_io_storage_fn", ] def __init__( @@ -61,6 +62,7 @@ def __init__( supports_resize: bool = False, supports_prepacking: bool = False, are_node_inputs_supported_fn: Optional[Callable] = allow_node, + pick_io_storage_fn: Optional[Callable] = None, ): self.inputs_storage: utils.TensorRepSetList = utils.TensorRepSetList( inputs_storage if inputs_storage is not None else [] @@ -77,15 +79,21 @@ def __init__( self.supports_prepacking = supports_prepacking self.are_node_inputs_supported_fn = are_node_inputs_supported_fn + self.pick_io_storage_fn = pick_io_storage_fn def make_op_repsets( self, op_node: torch.fx.Node, texture_limits: utils.ImageExtents = utils.DEFAULT_TEXTURE_LIMITS, ) -> utils.OpRepSets: - return utils.OpRepSets( - self.inputs_storage, self.outputs_storage, op_node, texture_limits - ) + inputs_storage = self.inputs_storage + outputs_storage = self.outputs_storage + if self.pick_io_storage_fn is not None: + i_storage, o_storage = self.pick_io_storage_fn(op_node) + inputs_storage = utils.TensorRepSetList(i_storage) + outputs_storage = utils.TensorRepSetList(o_storage) + + return utils.OpRepSets(inputs_storage, outputs_storage, op_node, texture_limits) ####################### @@ -410,28 +418,16 @@ def register_softmax_op(): ) def register_reduce_op(): def check_reduce_node(node: torch.fx.Node) -> bool: + # Only one argument implies that the reduction is over the entire tensor, which + # is not supported yet. + if len(node.args) == 1: + return False + dim_list = node.args[1] + # Only 1D and 2D reductions are supported at the moment. 
if isinstance(dim_list, list) and len(dim_list) > 2: return False - if isinstance(dim_list, list) and len(dim_list) == 2: - # Try to get the memory layout for this node - try: - memory_layout = utils.get_node_memory_layout(node) - - # If we have memory layout information, check if any dimension in dim_list corresponds to a packed dimension - if ( - memory_layout is not None - and memory_layout != VkMemoryLayout.DEFAULT_LAYOUT - ): - # For now only default layout is supported for 2D reduction. - # Because we can't determine if the input is NCHW or NHWC here, - # assume the reduction dimension is packed so we cannot support it. - return False - except (AssertionError, KeyError, AttributeError): - # If we can't get memory layout information, we'll assume the dims aren't packed - pass - def try_find_keepdim_arg(node: torch.fx.Node) -> bool: for arg in node.args: if isinstance(arg, bool): @@ -446,10 +442,41 @@ def try_find_keepdim_arg(node: torch.fx.Node) -> bool: return True + def pick_io_storage_for_reduce(node: torch.fx.Node): + inputs_storage = utils.ANY_TEXTURE + outputs_storage = utils.ANY_TEXTURE + + input_tensor = node.args[0] + ndim = input_tensor.meta["val"].ndim + dim_list = node.args[1] + if isinstance(dim_list, list) and len(dim_list) == 2: + reduce_dim1_whcn = utils.nchw_dim_to_whcn_dim(dim_list[0], ndim) + reduce_dim2_whcn = utils.nchw_dim_to_whcn_dim(dim_list[1], ndim) + + possible_packed_dims = {0, 1, 2} + possible_packed_dims.discard(reduce_dim1_whcn) + possible_packed_dims.discard(reduce_dim2_whcn) + + packed_dim = possible_packed_dims.pop() + assert packed_dim in [0, 1, 2] + + if packed_dim == 0: + inputs_storage = utils.WIDTH_PACKED_TEXTURE + outputs_storage = utils.WIDTH_PACKED_TEXTURE + elif packed_dim == 1: + inputs_storage = utils.HEIGHT_PACKED_TEXTURE + outputs_storage = utils.HEIGHT_PACKED_TEXTURE + else: + inputs_storage = utils.CHANNELS_PACKED_TEXTURE + outputs_storage = utils.CHANNELS_PACKED_TEXTURE + + return inputs_storage, 
outputs_storage + return OpFeatures( inputs_storage=utils.ANY_TEXTURE, supports_resize=True, are_node_inputs_supported_fn=check_reduce_node, + pick_io_storage_fn=pick_io_storage_for_reduce, ) @@ -474,6 +501,23 @@ def register_2d_pool_op(): ] ) def register_convolution_op(): + def check_conv_node(node: torch.fx.Node) -> bool: + x = node.args[0] + x_shape = x.meta["val"].size() + # 4-D input implies 2D convolution + if len(x_shape) == 4: + batches = x.meta["val"].size()[0] + if batches != 1: + return False + # 3-D input implies 1D convolution + if len(x_shape) == 3: + transpose = node.args[6] + # Transposed 1D convolution is not supported yet + if transpose: + return False + + return True + return OpFeatures( inputs_storage=[ utils.CHANNELS_PACKED_TEXTURE, # input @@ -490,23 +534,27 @@ def register_convolution_op(): ], supports_resize=True, supports_prepacking=True, + are_node_inputs_supported_fn=check_conv_node, ) @update_features( [ - exir_ops.edge.et_vk.conv2d_q8ta_q8csw.default, + exir_ops.edge.et_vk.conv2d_q8ta_q8csw_q8to.default, + exir_ops.edge.et_vk.conv2d_q8ta_q8csw_q8to_dw.default, ] ) def register_quantized_conv_op(): return OpFeatures( inputs_storage=[ - utils.CHANNELS_PACKED_TEXTURE, # input + utils.PACKED_INT8_4W4C_BUFFER, # input utils.NO_STORAGE, # input_scale (non tensor) utils.NO_STORAGE, # input_zero_point (non tensor) utils.NO_STORAGE, # weight (prepacked) utils.NO_STORAGE, # weight_sums (prepacked) utils.NO_STORAGE, # weight_scales (prepacked) + utils.NO_STORAGE, # output_scale (non tensor) + utils.NO_STORAGE, # output_zero_point (non tensor) utils.NO_STORAGE, # bias (prepacked) utils.NO_STORAGE, # kernel_size (non tensor) utils.NO_STORAGE, # stride (non tensor) @@ -520,6 +568,53 @@ def register_quantized_conv_op(): ) +@update_features( + [ + exir_ops.edge.et_vk.add_q8ta_q8ta_q8to.default, + ] +) +def register_quantized_binary_op(): + return OpFeatures( + inputs_storage=utils.PACKED_INT8_4W4C_BUFFER, + supports_resize=False, + 
supports_prepacking=True, + ) + + +@update_features( + [ + exir_ops.edge.et_vk.quantize_q8ta_for_conv2d.default, + ] +) +def register_quantize_for_conv2d_op(): + return OpFeatures( + inputs_storage=[ + utils.CHANNELS_PACKED_TEXTURE, + ], + outputs_storage=[ + utils.PACKED_INT8_4W4C_BUFFER, + ], + supports_resize=False, + ) + + +@update_features( + [ + exir_ops.edge.et_vk.dequantize_q8to_from_conv2d.default, + ] +) +def register_dequantize_for_conv2d_op(): + return OpFeatures( + inputs_storage=[ + utils.PACKED_INT8_4W4C_BUFFER, + ], + outputs_storage=[ + utils.CHANNELS_PACKED_TEXTURE, + ], + supports_resize=False, + ) + + @update_features("llama::sdpa_with_kv_cache") def register_sdpa_with_kv_cache_op(): return OpFeatures( @@ -666,6 +761,7 @@ def register_ported_ops_with_prepacking(): return OpFeatures( inputs_storage=utils.CHANNELS_PACKED_TEXTURE, supports_prepacking=True, + supports_resize=True, ) @@ -696,6 +792,7 @@ def register_ported_ops_with_prepacking_all_dims(): return OpFeatures( inputs_storage=utils.ANY_TEXTURE, supports_prepacking=True, + supports_resize=True, ) diff --git a/backends/vulkan/partitioner/vulkan_partitioner.py b/backends/vulkan/partitioner/vulkan_partitioner.py index e5b2d0f7864..0bdc16616ef 100644 --- a/backends/vulkan/partitioner/vulkan_partitioner.py +++ b/backends/vulkan/partitioner/vulkan_partitioner.py @@ -36,7 +36,7 @@ Partitioner, PartitionResult, ) -from executorch.exir.backend.utils import tag_constant_data +from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer from executorch.exir.dialects._ops import ops as exir_ops from torch.export.exported_program import ExportedProgram @@ -254,9 +254,10 @@ def _is_node_supported(self, node: torch.fx.Node) -> bool: # noqa: C901 self.log_skip(node, "permute node of non compatible linear node") return False - is_in_local_scalar_dense_chain, dst_node_is_compatible = ( - self.is_in_local_scalar_dense_chain(node) - ) + ( + is_in_local_scalar_dense_chain, + 
dst_node_is_compatible, + ) = self.is_in_local_scalar_dense_chain(node) if is_in_local_scalar_dense_chain and dst_node_is_compatible: return True elif is_in_local_scalar_dense_chain: @@ -419,6 +420,7 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: logger.info(f"Found {pl} Vulkan subgraphs to be partitioned.") tag_constant_data(exported_program) + tag_mutated_buffer(exported_program) return PartitionResult( tagged_exported_program=exported_program, partition_tags=partition_tags diff --git a/backends/vulkan/patterns/TARGETS b/backends/vulkan/patterns/TARGETS index 791edf58984..285efe2b933 100644 --- a/backends/vulkan/patterns/TARGETS +++ b/backends/vulkan/patterns/TARGETS @@ -11,6 +11,7 @@ runtime.python_library( "rope.py", "quantized_linear.py", "quantized_convolution.py", + "quantized_binary.py", ], visibility = [ "//executorch/backends/...", diff --git a/backends/vulkan/patterns/__init__.py b/backends/vulkan/patterns/__init__.py index 8ffad98b3c3..e23dfc7629c 100644 --- a/backends/vulkan/patterns/__init__.py +++ b/backends/vulkan/patterns/__init__.py @@ -6,6 +6,8 @@ from typing import List +import executorch.backends.vulkan.patterns.quantized_binary # noqa + import executorch.backends.vulkan.patterns.quantized_convolution # noqa import executorch.backends.vulkan.patterns.quantized_linear # noqa diff --git a/backends/vulkan/patterns/quantized_binary.py b/backends/vulkan/patterns/quantized_binary.py new file mode 100644 index 00000000000..da4985b931d --- /dev/null +++ b/backends/vulkan/patterns/quantized_binary.py @@ -0,0 +1,161 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Optional + +import executorch.backends.vulkan.utils as utils + +import torch + +from executorch.backends.vulkan.patterns.pattern_registry import ( + PatternMatch, + register_pattern_detector, + register_pattern_replacement, +) + +from executorch.exir import ExportedProgram +from executorch.exir.dialects._ops import ops as exir_ops + + +class QuantizedBinaryMatch(PatternMatch): + def __init__(self, binary_node: torch.fx.Node) -> None: + self.anchor_node = binary_node + self.match_found = False + self.all_nodes = [self.anchor_node] + + # Extract alpha parameter if it exists (for add operations) + self.alpha = 1.0 + if len(binary_node.args) > 2 and binary_node.args[2] is not None: + # Alpha is typically a scalar value + if isinstance(binary_node.args[2], (int, float)): + self.alpha = binary_node.args[2] + + # Identify input nodes - both should be dequantize nodes for static quantization + if len(binary_node.args) < 2: + return + + input_a_node = binary_node.args[0] + assert isinstance(input_a_node, torch.fx.Node) + input_b_node = binary_node.args[1] + assert isinstance(input_b_node, torch.fx.Node) + + # Both arguments must be dequant nodes for static quantization + if not utils.is_dequant_node(input_a_node) or not utils.is_dequant_node( + input_b_node + ): + return + + self.dequantize_input_a_node = input_a_node + self.dequantize_input_b_node = input_b_node + + # Extract quantization parameters for input A + self.quantize_input_a_node = self.dequantize_input_a_node.args[0] + self.input_a_scales_node = self.dequantize_input_a_node.args[1] + self.input_a_zeros_node = self.dequantize_input_a_node.args[2] + + # Extract quantization parameters for input B + self.quantize_input_b_node = self.dequantize_input_b_node.args[0] + self.input_b_scales_node = self.dequantize_input_b_node.args[1] + self.input_b_zeros_node = self.dequantize_input_b_node.args[2] + + self.all_nodes.extend( + [self.dequantize_input_a_node, self.dequantize_input_b_node] + ) + + # 
Identify output node + self.output_node = self.anchor_node + + # The binary operation output must have only one user; it will be either a relu node + # or a quantize node. + if len(self.output_node.users) != 1: + return + + cur_node = list(self.output_node.users)[0] + self.relu_node = None + if cur_node.target == exir_ops.edge.aten.relu.default: + self.relu_node = cur_node + self.all_nodes.append(self.relu_node) + # If there's a relu, get its user (should be the quantize node) + if len(cur_node.users) != 1: + return + cur_node = list(cur_node.users)[0] + + if not utils.is_quant_node(cur_node): + return + + self.quantize_output_node = cur_node + self.output_scales_node = self.quantize_output_node.args[1] + self.output_zeros_node = self.quantize_output_node.args[2] + + self.all_nodes.append(self.quantize_output_node) + + self.match_found = True + + +# Define the binary operation anchor nodes that we support +binary_anchor_nodes = { + exir_ops.edge.aten.add.Tensor, + exir_ops.edge.aten.add_.Tensor, +} + + +@register_pattern_detector("quantized_binary") +def find_quantized_binary_patterns( + node: torch.fx.Node, +) -> Optional[QuantizedBinaryMatch]: + if node.target not in binary_anchor_nodes: + return None + + matched_pattern = QuantizedBinaryMatch(node) + if matched_pattern.match_found: + return matched_pattern + + return None + + +## +## Pattern Replacement +## + + +@register_pattern_replacement("quantized_binary") +def make_add_q8ta_q8ta_q8to_custom_op( + ep: ExportedProgram, + graph_module: torch.fx.GraphModule, + match: QuantizedBinaryMatch, +): + # Determine the operation type based on the anchor node + op_target = None + if match.anchor_node.target in { + exir_ops.edge.aten.add.Tensor, + exir_ops.edge.aten.add_.Tensor, + }: + op_target = exir_ops.edge.et_vk.add_q8ta_q8ta_q8to.default + else: + # For future binary operations, add more mappings here + raise NotImplementedError( + f"Unsupported binary operation: {match.anchor_node.target}" + ) + + with 
graph_module.graph.inserting_before(match.output_node): + qbinary_node = graph_module.graph.create_node( + "call_function", + op_target, + args=( + match.quantize_input_a_node, + match.quantize_input_b_node, + match.input_a_scales_node, + match.input_a_zeros_node, + match.input_b_scales_node, + match.input_b_zeros_node, + match.output_scales_node, + match.output_zeros_node, + match.alpha, # Alpha parameter for scaling + ), + ) + + qbinary_node.meta["val"] = match.output_node.meta["val"] + match.quantize_output_node.replace_all_uses_with(qbinary_node) diff --git a/backends/vulkan/patterns/quantized_convolution.py b/backends/vulkan/patterns/quantized_convolution.py index 65b51b5e103..522a19c58d6 100644 --- a/backends/vulkan/patterns/quantized_convolution.py +++ b/backends/vulkan/patterns/quantized_convolution.py @@ -76,11 +76,13 @@ def __init__(self, conv_node: torch.fx.Node) -> None: # Identify output node self.output_node = self.anchor_node - out_channels = self.output_node.meta["val"].shape[-1] - # The implementation requires that for grouped convolutions, a group does not - # cross any texel boundary. The output channels per group must be a multiple of - # 4. If this is not true, then don't match the pattern. - if self.groups > 1 and (out_channels / self.groups) % 4 == 0: + out_channels = self.output_node.meta["val"].shape[-3] + # The implementation requires that for non-depthwise grouped convolutions, a + # group does not cross the texel boundary. The output channels per group must be + # a multiple of 4. If this is not true, then don't match the pattern. 
+ if (self.groups > 1 and self.groups < out_channels) and ( + out_channels / self.groups + ) % 4 != 0: return # Identify bias node, if applicable @@ -93,23 +95,37 @@ def __init__(self, conv_node: torch.fx.Node) -> None: self.all_nodes.extend(arg_chain) # Identify input node - self.fp_input_node, self.quantize_input_node, dq_node = ( - utils.maybe_skip_q_dq_arg_chain(self.anchor_node.args[0]) - ) - assert self.fp_input_node is not None - self.all_nodes.append(self.fp_input_node) - assert self.quantize_input_node is not None - assert dq_node is not None - - self.input_scales_node = self.quantize_input_node.args[1] - self.input_zeros_node = self.quantize_input_node.args[2] - - self.all_nodes.extend( - [ - self.quantize_input_node, - dq_node, - ] - ) + primary_input_node = self.anchor_node.args[0] + assert isinstance(primary_input_node, torch.fx.Node) + # Argument must be a dequant node for static quantization + if not utils.is_dequant_node(primary_input_node): + return + + self.dequantize_input_node = primary_input_node + self.quantize_input_node = self.dequantize_input_node.args[0] + + self.input_scales_node = self.dequantize_input_node.args[1] + self.input_zeros_node = self.dequantize_input_node.args[2] + + self.all_nodes.extend([self.dequantize_input_node]) + + # The convolution output must have only one user; it will be either a relu node + # or a dequantize node. 
+ if len(self.output_node.users) != 1: + return + + cur_node = list(self.output_node.users)[0] + self.relu_node = None + if cur_node.target == exir_ops.edge.aten.relu.default: + self.relu_node = cur_node + cur_node = list(cur_node.users)[0] + + if not utils.is_quant_node(cur_node): + return + + self.quantize_output_node = cur_node + self.output_scales_node = self.quantize_output_node.args[1] + self.output_zeros_node = self.quantize_output_node.args[2] self.match_found = True @@ -161,13 +177,26 @@ def make_conv2d_q8ta_q8csw_custom_op( bias_tensor = get_param_tensor(ep, match.bias_node) assert bias_tensor is not None - OC, IC, H, W = weight_tensor.shape + OC, IC_per_group, H, W = weight_tensor.shape - # Reshape weight tensor from (OC, IC, H, W) to (OC, H * W * IC) (i.e. matrix format) - # This prepares the weights for Im2Col-based convolution - weight_tensor = ( - weight_tensor.permute(0, 2, 3, 1).contiguous().view(OC, H * W * IC).contiguous() - ) + is_depthwise_conv = IC_per_group == 1 and match.groups == OC + + if is_depthwise_conv: + assert OC % 4 == 0, "depthwise conv requires that OC is divisible by 4" + # Depthwise convs use a specialized layout; the weight tensor is reshaped to + # (H, W, OC) + weight_tensor = ( + weight_tensor.permute(2, 3, 1, 0).contiguous().view(H, W, OC).contiguous() + ) + else: + # Reshape weight tensor from (OC, IC_per_group, H, W) to (OC, H * W * IC_per_group) + # (i.e. matrix format). This prepares the weights for Im2Col-based convolution. + weight_tensor = ( + weight_tensor.permute(0, 2, 3, 1) + .contiguous() + .view(OC, H * W * IC_per_group) + .contiguous() + ) # Need to make sure that OC dim is a multiple of 4 so that data load/stores are well # aligned with texel boundaries. 
Add padding to align to the next multiple of 4 if @@ -178,6 +207,7 @@ def make_conv2d_q8ta_q8csw_custom_op( utils.align_width_and_update_state_dict( ep, match.weight_scales_node, weight_scales_tensor ) + if bias_tensor is not None: utils.align_width_and_update_state_dict(ep, match.bias_node, bias_tensor) @@ -185,7 +215,7 @@ def make_conv2d_q8ta_q8csw_custom_op( with graph_module.graph.inserting_before(first_graph_node): qweight_tensor_name = utils.get_tensor_name(ep, match.weight_node) # Pre-compute the weight sums which are needed to apply activation zero point - # when using integer accumulation. For the reshaped 2D weight matrix (IC * H * W, OC), + # when using integer accumulation. For the reshaped 2D weight matrix (IC_per_group * H * W, OC), # sum over dimension 0 to get sums per output channel sum_per_output_channel = weight_tensor.sum(dim=1).to(torch.int32).contiguous() sums_name = qweight_tensor_name + "_sums" @@ -201,16 +231,22 @@ def make_conv2d_q8ta_q8csw_custom_op( ) with graph_module.graph.inserting_before(match.output_node): + op_target = exir_ops.edge.et_vk.conv2d_q8ta_q8csw_q8to.default + if is_depthwise_conv: + op_target = exir_ops.edge.et_vk.conv2d_q8ta_q8csw_q8to_dw.default + qconv_node = graph_module.graph.create_node( "call_function", - exir_ops.edge.et_vk.conv2d_q8ta_q8csw.default, + op_target, args=( - match.fp_input_node, + match.quantize_input_node, match.input_scales_node, match.input_zeros_node, match.weight_node, weight_sums_node, match.weight_scales_node, + match.output_scales_node, + match.output_zeros_node, match.bias_node, # Add bias after weight_scales [H, W], # Pass kernel size information before stride match.stride, @@ -221,4 +257,4 @@ def make_conv2d_q8ta_q8csw_custom_op( ) qconv_node.meta["val"] = match.output_node.meta["val"] - match.output_node.replace_all_uses_with(qconv_node) + match.quantize_output_node.replace_all_uses_with(qconv_node) diff --git a/backends/vulkan/patterns/quantized_linear.py 
b/backends/vulkan/patterns/quantized_linear.py index 882d0d41e6d..374e29c634d 100644 --- a/backends/vulkan/patterns/quantized_linear.py +++ b/backends/vulkan/patterns/quantized_linear.py @@ -92,9 +92,11 @@ def __init__(self, mm_node: torch.fx.Node) -> None: return # Identify input node - self.fp_input_node, self.quantize_input_node, dq_node = ( - utils.maybe_skip_q_dq_arg_chain(self.anchor_node.args[0]) - ) + ( + self.fp_input_node, + self.quantize_input_node, + dq_node, + ) = utils.maybe_skip_q_dq_arg_chain(self.anchor_node.args[0]) assert self.fp_input_node is not None self.all_nodes.append(self.fp_input_node) @@ -386,7 +388,7 @@ def make_linear_dq8ca_q4gsw_op( weight_sums_node = create_constant_placeholder( exp_program=ep, graph=graph_module.graph, - kind=InputKind.CONSTANT_TENSOR, + kind=InputKind.PARAMETER, name=sums_name, data=sum_per_quant_group, ) @@ -429,7 +431,7 @@ def make_linear_q8ta_q8csw_custom_op( weight_sums_node = create_constant_placeholder( exp_program=ep, graph=graph_module.graph, - kind=InputKind.CONSTANT_TENSOR, + kind=InputKind.PARAMETER, name=sums_name, data=sum_per_output_channel, ) diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp index 67b646ae1a8..fe8cc83c481 100644 --- a/backends/vulkan/runtime/VulkanBackend.cpp +++ b/backends/vulkan/runtime/VulkanBackend.cpp @@ -139,6 +139,10 @@ utils::GPUMemoryLayout get_memory_layout( return utils::kHeightPacked; case vkgraph::VkMemoryLayout::TENSOR_CHANNELS_PACKED: return utils::kChannelsPacked; + case vkgraph::VkMemoryLayout::PACKED_INT8_4W4C: + return utils::kPackedInt8_4W4C; + case vkgraph::VkMemoryLayout::PACKED_INT8_4H4W: + return utils::kPackedInt8_4H4W; default: break; } diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 433ae15db4e..d798b203673 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -14,6 
+14,21 @@ namespace vkcompute { namespace api { +/* + * For PackedInt8 memory layouts, ensure that the scalar type used for the + * tensor is kInt8x4. Otherwise, return the original scalar type. + */ +vkapi::ScalarType get_effective_scalar_type( + const vkapi::ScalarType dtype, + const utils::GPUMemoryLayout memory_layout) { + vkapi::ScalarType effective_dtype = dtype; + if (utils::is_packed_int8_layout(memory_layout)) { + VK_CHECK_COND(dtype == vkapi::kInt8x4 || dtype == vkapi::kChar); + effective_dtype = vkapi::kInt8x4; + } + return effective_dtype; +} + /* * Used to infer the sizes of a tensor that would correspond to a given * VulkanImage. @@ -187,6 +202,7 @@ std::vector calculate_padded_sizes( utils::uvec3 calculate_image_extents( const std::vector& padded_sizes, + const utils::GPUMemoryLayout memory_layout, const std::vector& axis_map, const int32_t packed_dim) { utils::uvec3 extents({1, 1, 1}); @@ -205,6 +221,28 @@ utils::uvec3 calculate_image_extents( extents[axis] = utils::safe_downcast(padded_sizes.at(dim)); } + // For "regular" tensor dtypes, 4 elements along the packed dim are packed + // into one texel (4-component vectorized type). However, for packed int8 + // memory layouts, an additional level of packing is employed where 4 int8 + // elements are packed into one int32, and then 4 int32 are packed into each + // ivec4 texel. + if (utils::is_packed_int8_layout(memory_layout)) { + // Each int in the ivec4 contains 4 channels. The overall ivec4 contains + // data for a 1Hx4Wx4C block of the input tensor. + if (memory_layout == utils::kPackedInt8_4W4C) { + VK_CHECK_COND(packed_dim == 2); + extents[axis_map.at(0)] = utils::div_up(extents[axis_map.at(0)], 4u); + } + // Each int in the ivec4 contains 4 elements along the width dim. The + // overall ivec4 contains data for a 4Hx4W block of the input tensor. 
+ else if (memory_layout == utils::kPackedInt8_4H4W) { + VK_CHECK_COND(packed_dim == 0); + extents[axis_map.at(1)] = utils::div_up(extents[axis_map.at(1)], 4u); + } else { + VK_THROW("Unhandled packed int8 memory layout!"); + } + } + // axis_map[3] indicates the WHCN index of the dimension used for batch // concatenation. Thus a double lookup is required to determine the image axis // used for batch concatenation. @@ -215,6 +253,7 @@ utils::uvec3 calculate_image_extents( VK_CHECK_COND(extents[axis_map.at(packed_dim)] % 4 == 0); extents[axis_map.at(packed_dim)] /= 4; + return extents; } @@ -247,35 +286,72 @@ utils::uvec3 calculate_logical_limits( */ utils::uvec3 calculate_logical_limits( const std::vector& sizes, + const utils::GPUMemoryLayout memory_layout, const std::vector& axis_map, const int32_t packed_dim) { return calculate_logical_limits( calculate_image_extents( - calculate_padded_sizes(sizes, packed_dim), axis_map, packed_dim), + calculate_padded_sizes(sizes, packed_dim), + memory_layout, + axis_map, + packed_dim), axis_map); } int64_t calculate_gpu_buffer_numel( + const std::vector& sizes, + const utils::GPUMemoryLayout memory_layout, + const vkapi::ScalarType dtype) { + size_t numel; + + // Mirrors the logic in calculate_image_extents for packed int8 memory layouts + if (dtype == vkapi::kInt8x4) { + VK_CHECK_COND(utils::is_packed_int8_layout(memory_layout)); + std::vector blocks_in_dim = + flip_and_unsqueeze(sizes, kTensorSizes, 0); + // Each ivec4 contains data for a 1Hx4Wx4C block of the input + if (memory_layout == utils::kPackedInt8_4W4C) { + blocks_in_dim[0] = utils::div_up_4(blocks_in_dim[0]); + blocks_in_dim[2] = utils::div_up_4(blocks_in_dim[2]); + } + // Each ivec4 contains data for a 4Hx4W block of the input + else if (memory_layout == utils::kPackedInt8_4H4W) { + blocks_in_dim[0] = utils::div_up_4(blocks_in_dim[0]); + blocks_in_dim[1] = utils::div_up_4(blocks_in_dim[1]); + } else { + VK_THROW("Unhandled packed int8 memory layout!"); + } + // 
Each block is represented as an ivec4, and the base dtype of the buffer + // is int. Therefore, need to multiply the number of blocks by 4 to obtain + // the number of int elements in the data buffer. + numel = utils::multiply_integers(blocks_in_dim) * 4; + } + // Case for "regular" dtypes/memory layouts + else { + numel = utils::multiply_integers(sizes); + + // For 8-bit types, align to the next multiple of 4. For devices that do not + // support 8-bit storage buffers, the tensor data will be interpreted as an + // array of int32 instead. + if (vkapi::element_size(dtype) == 1) { + numel = utils::align_up_4(numel); + } + } + return numel; +} + +int64_t calculate_staging_or_gpu_buffer_numel( Context* const context, const std::vector& sizes, const utils::uvec3 image_extents, const utils::StorageType storage_type, + const utils::GPUMemoryLayout memory_layout, const vkapi::ScalarType dtype) { // For texture backed tensors, simply multiply the total number of texels by 4 if (storage_type != utils::kBuffer) { return image_extents[0] * image_extents[1] * image_extents[2] * 4; } - const bool is_int8 = dtype == vkapi::kChar; - const bool int8_supported = - context->adapter_ptr()->has_full_int8_buffers_support(); - const size_t numel = utils::multiply_integers(sizes); - // For int8 tensors, if the device does not support int8 buffers, then int32 - // is used instead to represent the buffer data. Therefore the number of - // elements in the buffer is aligned to the next multiple of 4. 
- if (is_int8 && int8_supported) { - return utils::align_up_4(numel); - } - return numel; + return calculate_gpu_buffer_numel(sizes, memory_layout, dtype); } template ::value>> @@ -332,10 +408,12 @@ vkapi::VulkanImage allocate_image( Context* const context_ptr, utils::uvec3& image_extents, const utils::StorageType storage_type, - const VkFormat image_format, + const vkapi::ScalarType dtype, const bool allocate_memory) { vkapi::Adapter* adapter_ptr = context_ptr->adapter_ptr(); + const VkFormat image_format = vkcompute::vkapi::to_vkformat(dtype); + vkapi::ImageSampler::Properties sampler_props{ VK_FILTER_NEAREST, VK_SAMPLER_MIPMAP_MODE_NEAREST, @@ -420,6 +498,7 @@ vkapi::VulkanBuffer allocate_buffer( vTensorStorage::vTensorStorage( Context* const context, const utils::StorageType storage_type, + const utils::GPUMemoryLayout memory_layout, const std::vector& axis_map, const int32_t packed_dim, const std::vector& sizes, @@ -429,20 +508,22 @@ vTensorStorage::vTensorStorage( storage_type_{storage_type}, image_extents_(calculate_image_extents( calculate_padded_sizes(sizes, packed_dim), + memory_layout, axis_map, packed_dim)), - buffer_length_{calculate_gpu_buffer_numel( + buffer_length_{calculate_staging_or_gpu_buffer_numel( context_, sizes, image_extents_, storage_type, + memory_layout, dtype)}, buffer_offset_{0}, image_(allocate_image( context_, image_extents_, storage_type_, - to_vkformat(dtype), + dtype, allocate_memory)), buffer_(allocate_buffer( context_, @@ -553,7 +634,7 @@ vTensor::vTensor( const utils::GPUMemoryLayout memory_layout, const bool allocate_memory, const utils::AxisMapLayout axis_map_layout) - : dtype_(dtype), + : dtype_(get_effective_scalar_type(dtype, memory_layout)), // Calculate tensor metadata sizes_(sizes.begin(), sizes.end()), packed_dim_(utils::to_packed_dim(memory_layout)), @@ -576,6 +657,7 @@ vTensor::vTensor( storage_(std::make_shared( context, storage_type, + memory_layout, axis_map_, packed_dim_, sizes, @@ -785,6 +867,16 @@ 
vkapi::VulkanBuffer& vTensor::buffer( } utils::GPUMemoryLayout vTensor::estimate_memory_layout() const { + if (dtype_ == vkapi::kInt8x4) { + switch (packed_dim_) { + case WHCN::kChannelsDim: + return utils::kPackedInt8_4W4C; + case WHCN::kWidthDim: + return utils::kPackedInt8_4H4W; + default: + VK_THROW("Invalid packed dim for Tensor with kInt8x4 type"); + } + } switch (packed_dim_) { case WHCN::kWidthDim: return utils::kWidthPacked; @@ -914,8 +1006,8 @@ void vTensor::update_metadata() { flip_and_unsqueeze_ivec4(dim_order_, kTensorDimOrder, numel_); uniform_data_->strides_v = flip_and_unsqueeze_ivec4(strides_, kTensorStrides, numel_); - uniform_data_->logical_limits.limits = - calculate_logical_limits(sizes_, axis_map_, packed_dim_); + uniform_data_->logical_limits.limits = calculate_logical_limits( + sizes_, estimate_memory_layout(), axis_map_, packed_dim_); if (sizes_uniform_offset_ != kUniformOffsetUnset) { uniforms_.update(uniform_data_->sizes_v, sizes_uniform_offset_); @@ -942,11 +1034,15 @@ void vTensor::update_metadata() { } void vTensor::check_sizes(const std::vector& sizes) const { + utils::GPUMemoryLayout est_memory_layout = estimate_memory_layout(); if (storage_type() != utils::kBuffer) { // For texture storage check that the current texture is large enough for // the new sizes of the tensor. utils::uvec3 virtual_extents = calculate_image_extents( - calculate_padded_sizes(sizes_, packed_dim_), axis_map_, packed_dim_); + calculate_padded_sizes(sizes_, packed_dim_), + est_memory_layout, + axis_map_, + packed_dim_); bool valid_resize = virtual_extents[0] <= storage_->image_extents_[0]; valid_resize = @@ -958,9 +1054,10 @@ void vTensor::check_sizes(const std::vector& sizes) const { valid_resize, "tensor sizes requires a larger texture than the current one."); } else { - // For buffer storage check that the current buffer is large enough for the - // new sizes of the tensor. 
- int64_t numel = utils::multiply_integers(sizes); + // For buffer storage check that the current buffer is large enough for + // the new sizes of the tensor. + int64_t numel = + calculate_gpu_buffer_numel(sizes_, est_memory_layout, dtype_); bool valid_resize = numel + storage_->buffer_offset_ <= storage_->buffer_length_; VK_CHECK_COND( diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 66c1fd1e4da..d9fc7784cbc 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -99,6 +99,7 @@ class vTensorStorage final { vTensorStorage( Context* context, const utils::StorageType storage_type, + const utils::GPUMemoryLayout memory_layout, const std::vector& axis_map, const int32_t packed_dim, const std::vector& sizes, diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.glsl b/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.glsl new file mode 100644 index 00000000000..8b69642d2e9 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.glsl @@ -0,0 +1,78 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define NAME ${VARIANT_NAME} + +#define VEC4_T ${texel_load_type(DTYPE, "buffer")} +#define T ${texel_load_component_type(DTYPE, "buffer")} + +$if IO_STORAGE == "buffer": + #define PACKED_INT8_OUTPUT_BUFFER + #define PACKED_INT8_INPUT_BUFFER + +#define op(X, Y) ${OPERATOR} + +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +#extension GL_EXT_debug_printf : enable +#define DEBUG_MODE +#include "indexing.glslh" +#include "common.glslh" + +${layout_declare_tensor(B, "w", "t_packed_int8_out", "int", IO_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_packed_int8_in_a", "int", IO_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_packed_int8_in_b", "int", IO_STORAGE, is_scalar_array=False)} + +${layout_declare_ubo(B, "ivec4", "out_sizes")} + +layout(push_constant) uniform restrict Block { + float input_a_scale; + int input_a_zp; + float input_b_scale; + int input_b_zp; + float output_inv_scale; + int output_zp; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const int tid = int(gl_GlobalInvocationID.x); + + const int W4 = div_up_4(out_sizes.x); + const int H = out_sizes.y; + const int C4 = div_up_4(out_sizes.z); + const int N = out_sizes.w; + + if (tid >= W4 * H * C4 * N) { + return; + } + + const ivec4 in_block_1 = t_packed_int8_in_a[tid]; + const ivec4 in_block_2 = t_packed_int8_in_b[tid]; + + ivec4 out_block = ivec4(pack_into_int32(ivec4(output_zp))); + + for (int row = 0; row < 4; row++) { + vec4 in_texel_1 = unpack_and_dequantize( + in_block_1[row], input_a_scale, input_a_zp); + vec4 in_texel_2 = unpack_and_dequantize( + in_block_2[row], input_b_scale, input_b_zp); + + vec4 out_texel = op(in_texel_1, in_texel_2); + out_block[row] = quantize_and_pack(out_texel, output_inv_scale, output_zp); + } + + t_packed_int8_out[tid] = out_block; +} diff --git 
a/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.yaml b/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.yaml new file mode 100644 index 00000000000..e19ed8839eb --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.yaml @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +binary_q8ta_q8ta_q8to: + parameter_names_with_default_values: + OPERATOR: X + Y + NDIM: 3 + DTYPE: float + PACKING: C_packed + IO_STORAGE: buffer + generate_variant_forall: + IO_STORAGE: + - VALUE: buffer + shader_variants: + - NAME: add_q8ta_q8ta_q8to + OPERATOR: X + Y diff --git a/backends/vulkan/runtime/graph/ops/glsl/common.glslh b/backends/vulkan/runtime/graph/ops/glsl/common.glslh index 732b7006c2c..eb0ee02c2b4 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/common.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/common.glslh @@ -33,6 +33,59 @@ struct TensorIndex4D { ivec4 data; }; +int sign_extend_8bit(const int val) { + if ((val & 0x80) != 0) { + return val | (~0xFF); + } + return val; +} + +int extract_8bit_from_packed_int_le(const int packed, const int i) { + // account for little endian + int byte = sign_extend_8bit(packed >> (8 * i) & 0xFF); + return byte; +} + +ivec4 unpack_int8x4(const int packed) { + return ivec4( + extract_8bit_from_packed_int_le(packed, 0), + extract_8bit_from_packed_int_le(packed, 1), + extract_8bit_from_packed_int_le(packed, 2), + extract_8bit_from_packed_int_le(packed, 3)); +} + +int pack_4xqint_into_int32( + const int val0, + const int val1, + const int val2, + const int val3) { + int packed = (val0 & 0xFF) | ((val1 & 0xFF) << 8) | ((val2 & 0xFF) << 16) | + ((val3 & 0xFF) << 24); + + return packed; +} + +int pack_into_int32(const ivec4 quant_vals) { + int packed = ((quant_vals[0] & 0xFF) << 0) | ((quant_vals[1] & 0xFF) << 8) 
| + ((quant_vals[2] & 0xFF) << 16) | ((quant_vals[3] & 0xFF) << 24); + + return packed; +} + +vec4 unpack_and_dequantize( + const int packed_int8_vals, + const float scale, + const int zp) { + ivec4 unpacked = unpack_int8x4(packed_int8_vals); + return vec4(unpacked - zp) * scale; +} + +int quantize_and_pack(const vec4 vals, const float inv_scale, const int zp) { + ivec4 quantized = ivec4(round(vals * inv_scale) + zp); + quantized = clamp(quantized, -128, 127); + return pack_into_int32(quantized); +} + #ifdef DEBUG_MODE #extension GL_EXT_debug_printf : require diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl index 0f5dbc41273..88746c5594e 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl @@ -60,7 +60,7 @@ void main() { int num_steps = ((-ipos.y) + dilation.y - 1) / dilation.y; start.y = ipos.y + num_steps * dilation.y; } - const ivec2 end = min(ipos + overlay_region.xy, ivec2(in_sizes.xy)); + const ivec2 end = min(ipos + overlay_region.xy, in_sizes.xy); // Compute the start of the kernel based on how far we are skipping ahead when // reading the input. Note that these are "canonical" indices. 
ivec2 kstart = (start - ipos) / dilation; diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_common.glslh b/backends/vulkan/runtime/graph/ops/glsl/conv2d_common.glslh index 41825cba867..6f460d1398c 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_common.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_common.glslh @@ -27,6 +27,60 @@ struct Conv2DParams { int K4; }; +struct Conv2dTensorIndex { + ivec3 data; + int texel_i; +}; + +struct Conv2dBlockIndex { + ivec3 data; +}; + +Conv2dTensorIndex block_idx_to_tensor_idx(const Conv2dBlockIndex block_idx) { + Conv2dTensorIndex tensor_idx; + tensor_idx.data.x = mul_4(block_idx.data.x); + tensor_idx.data.y = block_idx.data.y; + tensor_idx.data.z = block_idx.data.z; + tensor_idx.texel_i = 0; + return tensor_idx; +} + +struct Conv2dBlockExtents { + ivec3 data; + int data_xz; +}; + +Conv2dBlockExtents make_block_extents(const ivec4 tensor_sizes) { + Conv2dBlockExtents block_sizes; + block_sizes.data.x = div_up_4(tensor_sizes.x); + block_sizes.data.y = tensor_sizes.y; + block_sizes.data.z = div_up_4(tensor_sizes.z); + + block_sizes.data_xz = block_sizes.data.x * block_sizes.data.z; + + return block_sizes; +} + +Conv2dBlockIndex linear_idx_to_block_idx( + const int idx, const Conv2dBlockExtents block_extents) { + Conv2dBlockIndex block_idx; + block_idx.data.z = idx % block_extents.data.z; + + const int row = idx / block_extents.data.z; + block_idx.data.x = row % block_extents.data.x; + block_idx.data.y = row / block_extents.data.x; + + return block_idx; +} + +bool block_idx_out_of_bounds( + const Conv2dBlockIndex block_idx, + const Conv2dBlockExtents block_extents) { + return block_idx.data.x >= block_extents.data.x || + block_idx.data.y >= block_extents.data.y || + block_idx.data.z >= block_extents.data.z; +} + #ifdef DEBUG_MODE void printConv2DParams(const Conv2DParams params) { diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl 
index 02fbef29b75..9089f87d658 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl @@ -54,7 +54,7 @@ void main() { // Compute the start and end of the input indices to load. Padding is assumed // to be constant 0 padding, so reads from the padding region are skipped. const ivec2 start = ipos; - const ivec2 end = ipos + overlay_region.xy; + const ivec2 end = min(ipos + overlay_region.xy, in_sizes.xy); VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0); int kx = 0; diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl index 19250419baf..7448b042cad 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl @@ -97,6 +97,10 @@ void main() { for (int y = start.y, i = 0; i < TILE_SIZE + BATCH_SIZE_Y - 1; y += dilation.y, i++) { for (int x = start.x, j = 0; j < TILE_SIZE + BATCH_SIZE_X - 1; x += dilation.x, j++) { in_texels[j] = texelFetch(t_in, ivec3(x, y, pos.z), 0); + // Set to zero if reading out of bounds + if (any(greaterThanEqual(ivec2(x, y), in_sizes.xy))) { + in_texels[j] = VEC4_T(0); + } } // from 2nd iteration onwards accumulate dot product in 2nd sum diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_q8_utils.glslh b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_q8_utils.glslh new file mode 100644 index 00000000000..f1d90aa83cb --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_q8_utils.glslh @@ -0,0 +1,214 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#ifndef CONV2D_DW_Q8_UTILS_GLSLH +#define CONV2D_DW_Q8_UTILS_GLSLH + +#extension GL_EXT_control_flow_attributes : require + +struct InputWindow1D { + vec4[MAX_WINDOW_WIDTH] data; + int len; +}; + +InputWindow1D initial_input_window() { + InputWindow1D input_window; + for (int i = 0; i < MAX_WINDOW_WIDTH; ++i) { + input_window.data[i] = vec4(0); + } + input_window.len = 0; + return input_window; +} + +vec4 dequantize(const int packed_texel, const float scale, const int zp) { + return vec4(unpack_int8x4(packed_texel) - zp) * scale; +} + +vec4 dequantize(const int packed_texel, const vec4 scales) { + return vec4(unpack_int8x4(packed_texel)) * scales; +} + +bool in_bounds( + const int block_w, + const int block_h, + const int block_c4, + const Conv2dBlockExtents block_extents) { + ivec3 idx = ivec3(block_w, block_h, block_c4); + if (any(lessThan(idx, ivec3(0)))) { + return false; + } + if (any(greaterThanEqual(idx, block_extents.data))) { + return false; + } + + return true; +} + +InputWindow1D load_input_window( + const int w_start, + const int w_end, + const int h, + const int c4, + const Conv2dBlockExtents block_extents, + const float input_scale, + const int input_zp, + const ivec4 input_zps) { + InputWindow1D input_window = initial_input_window(); + + const int block_w_start = div_4(w_start); + const int block_w_end = div_4(w_end); + + int window_i = 0; + for (int block_w = block_w_start; block_w <= block_w_end; ++block_w) { + ivec4 input_block = input_zps; + + if (in_bounds(block_w, h, c4, block_extents)) { +#ifdef PACKED_INT8_INPUT_BUFFER + const int buffer_idx = + h * block_extents.data_xz + block_w * block_extents.data.z + c4; + input_block = t_packed_int8_input[buffer_idx]; +#else + input_block = texelFetch(t_packed_int8_input, ivec3(block_w, h, c4), 0); +#endif + } + + const int loaded_w_start = mul_4(block_w); + for (int row = 0; row < 4; ++row) { + if (loaded_w_start + row >= w_start && loaded_w_start + row <= w_end) { + 
input_window.data[window_i++] = + dequantize(input_block[row], input_scale, input_zp); + } + } + } + input_window.len = window_i; + return input_window; +} + +struct WeightRow { + vec4[MAX_KERNEL_WIDTH] data; + int len; +}; + +WeightRow initial_weight_row() { + WeightRow weight_row; + for (int i = 0; i < MAX_KERNEL_WIDTH; ++i) { + weight_row.data[i] = vec4(0); + } + weight_row.len = 0; + return weight_row; +} + +WeightRow load_weight_row( + const int oc4, + const int ky, + const int OC4, + const int Kw, + const int Kw4, + const vec4 weight_scales) { + WeightRow weight_row = initial_weight_row(); + + int k4 = ky * Kw4; + int row_idx = 0; + for (int w = 0; w < Kw; w += 4) { +#ifdef WEIGHT_BUFFER + const ivec4 weight_block = t_packed_int8_weight[k4 * OC4 + oc4]; +#else + const ivec4 weight_block = texelFetch( + t_packed_int8_weight, ivec2(oc4, k4), 0); +#endif + + for (int row = 0; row < 4; ++row) { + if (w + row < Kw) { + weight_row.data[row_idx++] = dequantize(weight_block[row], weight_scales); + } + } + k4++; + } + weight_row.len = row_idx; + return weight_row; +} + +struct FPOutBlock { + vec4[4] data; +}; + +void perform_conv1d( + inout FPOutBlock out_block, + const InputWindow1D input_window, + const WeightRow weight_row) { + for (int out_w = 0; out_w < 4; ++out_w) { + [[unroll]] for (int kx = 0; kx < weight_row.len; ++kx) { + const int in_w = out_w * conv2d_params.stride.x; + out_block.data[out_w] = fma( + input_window.data[in_w + kx], + weight_row.data[kx], + out_block.data[out_w]); + } + } +} + +ivec4 quantize( + const vec4 texel, const float inv_scale, const int zp) { + vec4 quantized = round(texel * inv_scale) + zp; + return clamp(ivec4(quantized), -128, 127); +} + +ivec4 quantize_and_pack( + FPOutBlock out_block, const float inv_scale, const int zp) { + ivec4 packed_block; + for (int row = 0; row < 4; ++row) { + ivec4 quantized_texel = quantize(out_block.data[row], inv_scale, zp); + packed_block[row] = pack_into_int32(quantized_texel); + } + return 
packed_block; +} + +#ifdef DEBUG_MODE + +void printInputWindow1D(const InputWindow1D input_window) { + debugPrintfEXT("InputWindow1D contents (len = %d): \\n", input_window.len); + for (int i = 0; i < min(input_window.len, MAX_WINDOW_WIDTH); ++i) { + debugPrintfEXT( + " [%d]: (%.3f, %.3f, %.3f, %.3f) \\n", + i, + input_window.data[i].x, + input_window.data[i].y, + input_window.data[i].z, + input_window.data[i].w); + } +} + +void printWeightRow(const WeightRow weight_row) { + debugPrintfEXT("WeightRow contents (len = %d): \\n", weight_row.len); + for (int i = 0; i < min(weight_row.len, MAX_KERNEL_WIDTH); ++i) { + debugPrintfEXT( + " [%d]: (%.3f, %.3f, %.3f, %.3f) \\n", + i, + weight_row.data[i].x, + weight_row.data[i].y, + weight_row.data[i].z, + weight_row.data[i].w); + } +} + +void printFPOutBlock(const FPOutBlock out_block) { + debugPrintfEXT("FPOutBlock contents: \\n"); + for (int i = 0; i < 4; ++i) { + debugPrintfEXT( + " [%d]: (%.3f, %.3f, %.3f, %.3f) \\n", + i, + out_block.data[i].x, + out_block.data[i].y, + out_block.data[i].z, + out_block.data[i].w); + } + } + +#endif // DEBUG_MODE + +#endif // CONV2D_DW_Q8_UTILS_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_q8ta_q8csw_q8to.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_q8ta_q8csw_q8to.glsl new file mode 100644 index 00000000000..8994ced3acb --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_q8ta_q8csw_q8to.glsl @@ -0,0 +1,121 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} +#define VEC4_T ${texel_load_type(DTYPE, "buffer")} +#define T ${texel_load_component_type(DTYPE, "buffer")} + +$if IO_STORAGE == "buffer": + #define PACKED_INT8_OUTPUT_BUFFER + #define PACKED_INT8_INPUT_BUFFER +$if WEIGHT_STORAGE == "buffer": + #define WEIGHT_BUFFER + +#define MAX_WINDOW_WIDTH 12 +#define MAX_KERNEL_WIDTH 5 + +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +#include "conv2d_common.glslh" + +${layout_declare_tensor(B, "w", "t_packed_int8_output", "int", IO_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", IO_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_packed_int8_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False)} + +${layout_declare_ubo(B, "ivec4", "output_sizes")} +${layout_declare_ubo(B, "ivec4", "input_sizes")} +${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")} + +layout(push_constant) uniform restrict Block { + float input_scale; + int input_zp; + float output_inv_scale; + int output_zp; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +${layout_declare_spec_const(C, "int", "apply_bias", "1")} + +#include "conv2d_dw_q8_utils.glslh" + +void main() { + const int tid = int(gl_GlobalInvocationID.x); + Conv2dBlockExtents out_block_extents = make_block_extents(output_sizes); + + Conv2dBlockIndex out_block_idx = linear_idx_to_block_idx( + tid, out_block_extents); + + if (block_idx_out_of_bounds(out_block_idx, out_block_extents)) { + return; + } + + const int out_w = mul_4(out_block_idx.data.x); + const int w_start = + (out_w * conv2d_params.stride.x) - conv2d_params.padding.x; + const int w_end = 
((out_w + 3) * conv2d_params.stride.x) - + conv2d_params.padding.x + + (conv2d_params.kernel_size.x - 1) * conv2d_params.dilation.x; + + Conv2dBlockExtents in_block_extents = make_block_extents(input_sizes); + + const ivec4 input_zps = ivec4(pack_into_int32(ivec4(input_zp))); + const vec4 weight_scales = vec4(t_weight_scales[out_block_idx.data.z]); + + const int Kw4 = div_up_4(conv2d_params.kernel_size.x); + + FPOutBlock out_block; + for (int ky = 0; ky < conv2d_params.kernel_size.y; ky++) { + const int out_h = out_block_idx.data.y; + const int h = out_h * conv2d_params.stride.y - conv2d_params.padding.y + + ky * conv2d_params.dilation.y; + + InputWindow1D input_window = load_input_window( + w_start, + w_end, + h, + out_block_idx.data.z, + in_block_extents, + input_scale, + input_zp, + input_zps); + + WeightRow weight_row = load_weight_row( + out_block_idx.data.z, + ky, + out_block_extents.data.z, + conv2d_params.kernel_size.x, + Kw4, + weight_scales); + + perform_conv1d(out_block, input_window, weight_row); + } + + if (apply_bias > 0) { + const vec4 bias = vec4(t_bias[out_block_idx.data.z]); + for (int row = 0; row < 4; row++) { + out_block.data[row] += bias; + } + } + + const ivec4 packed_out_block = quantize_and_pack( + out_block, output_inv_scale, output_zp); + +#ifdef PACKED_INT8_OUTPUT_BUFFER + t_packed_int8_output[tid] = packed_out_block; +#else + imageStore(t_packed_int8_output, out_block_idx.data, packed_out_block); +#endif +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_q8ta_q8csw_q8to.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_q8ta_q8csw_q8to.yaml new file mode 100644 index 00000000000..77f801668a4 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_q8ta_q8csw_q8to.yaml @@ -0,0 +1,20 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +conv2d_dw_q8ta_q8csw_q8to: + parameter_names_with_default_values: + DTYPE: float + IO_STORAGE: texture3d + WEIGHT_STORAGE: texture2d + generate_variant_forall: + combination: + parameter_names: [IO_STORAGE, WEIGHT_STORAGE] + combos: + - parameter_values: [buffer, texture2d] + DTYPE: + - VALUE: float + shader_variants: + - NAME: conv2d_dw_q8ta_q8csw_q8to diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_input_tile_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_input_tile_load.glslh new file mode 100644 index 00000000000..be8a76421a5 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_input_tile_load.glslh @@ -0,0 +1,34 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifndef CONV2D_FP_INPUT_TILE_LOAD +#define CONV2D_FP_INPUT_TILE_LOAD + +#extension GL_EXT_control_flow_attributes : require + +#include "linear_fp_input_tile.glslh" + +VEC4_T load_fp_input_texel(const Conv2dTensorIndex tidx) { + return texelFetch(t_fp_input, tidx.data, 0); +} + +void load_fp_input_tile( + out FPInputTile tile, + const Conv2dBlockIndex block_idx) { +#if TILE_M == 4 && TILE_K4 == 1 + Conv2dTensorIndex load_tidx = block_idx_to_tensor_idx(block_idx); + [[unroll]] for (int w = 0; w < TILE_M; w++) { + tile.data[w][0] = load_fp_input_texel(load_tidx); + load_tidx.data.x++; + } +#else + not_implemented; +#endif +} + +#endif // CONV2D_FP_INPUT_TILE_LOAD diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_int8_input_block_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/conv2d_int8_input_block_load.glslh new file mode 100644 index 00000000000..44c226f6891 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_int8_input_block_load.glslh @@ -0,0 +1,30 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifndef CONV2D_INT8_INPUT_BLOCK_LOAD +#define CONV2D_INT8_INPUT_BLOCK_LOAD + +#extension GL_EXT_control_flow_attributes : require + +#include "conv2d_common.glslh" +#include "conv2d_int8_activation_block.glslh" + +void store_packed_int8_input_block( + const Conv2dBlockIndex block_idx, + const Conv2dBlockExtents block_extents, + const Int8ActivationBlock packed_int8_block) { +#ifdef OUTPUT_BUFFER + const int buffer_idx = block_idx.data.y * block_extents.data_xz + + block_idx.data.x * block_extents.data.z + block_idx.data.z; + t_packed_int8_input[buffer_idx] = packed_int8_block.data; +#else + imageStore(t_packed_int8_input, block_idx.data, packed_int8_block.data); +#endif +} + +#endif // CONV2D_INT8_INPUT_BLOCK_LOAD diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_int8_input_tile_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/conv2d_int8_input_tile_load.glslh new file mode 100644 index 00000000000..44aa09912ec --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_int8_input_tile_load.glslh @@ -0,0 +1,74 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#ifndef CONV2D_INT8_INPUT_TILE_LOAD +#define CONV2D_INT8_INPUT_TILE_LOAD + +#extension GL_EXT_control_flow_attributes : require + +#include "linear_int8_input_tile.glslh" + +struct Int8InputTileIndex { +#ifdef PACKED_INT8_INPUT_BUFFER + int data; +#else + ivec3 data; +#endif +}; + +Int8InputTileIndex make_initial_int8_input_tile_index( + const Conv2dBlockIndex block_idx, + const Conv2dBlockExtents block_extents) { + Int8InputTileIndex idx; +#ifdef PACKED_INT8_INPUT_BUFFER + idx.data = block_idx.data.y * block_extents.data_xz + + block_idx.data.x * block_extents.data.z; +#else + idx.data = ivec3(block_idx.data.x, block_idx.data.y, 0); +#endif + return idx; +} + +Int8InputTileIndex make_initial_int8_input_tile_index( + const Conv2dBlockIndex block_idx, + const Conv2dBlockExtents block_extents, + const int group_k4_offset) { + Int8InputTileIndex idx; +#ifdef PACKED_INT8_INPUT_BUFFER + idx.data = block_idx.data.y * block_extents.data_xz + + block_idx.data.x * block_extents.data.z + group_k4_offset; +#else + idx.data = ivec3(block_idx.data.x, block_idx.data.y, group_k4_offset); +#endif + return idx; +} + +void load_packed_int8_input_tile( + out Int8InputTile int8_tile, + const Int8InputTileIndex idx) { +#ifdef PACKED_INT8_INPUT_BUFFER + int8_tile.data[0][0] = t_packed_int8_input[idx.data]; +#else + int8_tile.data[0][0] = texelFetch(t_packed_int8_input, idx.data, 0); +#endif + + // Guard against unsupported tile sizes +#if TILE_M4 != 1 || TILE_K4 != 1 + not_implemented; +#endif +} + +void increment_k4(inout Int8InputTileIndex idx) { +#ifdef PACKED_INT8_INPUT_BUFFER + idx.data += 1; +#else + idx.data.z += 1; +#endif +} + +#endif // CONV2D_INT8_INPUT_TILE_LOAD diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_int8_output_tile_store.glslh b/backends/vulkan/runtime/graph/ops/glsl/conv2d_int8_output_tile_store.glslh new file mode 100644 index 00000000000..0a490360f98 --- /dev/null +++ 
b/backends/vulkan/runtime/graph/ops/glsl/conv2d_int8_output_tile_store.glslh @@ -0,0 +1,45 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifndef CONV2D_INT8_OUTPUT_TILE_STORE +#define CONV2D_INT8_OUTPUT_TILE_STORE + +#extension GL_EXT_control_flow_attributes : require + +#include "conv2d_common.glslh" +#include "linear_int8_output_tile.glslh" + +void store_packed_int8_output_tile( + const Int8OutTile int8_tile, + const Conv2dBlockIndex block_idx, + const Conv2dBlockExtents block_extents) { +#ifdef PACKED_INT8_OUTPUT_BUFFER + [[unroll]] for (int m4 = 0; m4 < TILE_M4; m4++) { + int buffer_idx = block_idx.data.y * block_extents.data_xz + + (block_idx.data.x + m4) * block_extents.data.z + block_idx.data.z; + [[unroll]] for (int n4 = 0; n4 < TILE_N4; n4++) { + if (block_idx.data.x + m4 < block_extents.data.x && + block_idx.data.z + n4 < block_extents.data.z) { + t_packed_int8_output[buffer_idx++] = int8_tile.data[m4][n4]; + } + } + } +#else + [[unroll]] for (int m4 = 0; m4 < TILE_M4; m4++) { + [[unroll]] for (int n4 = 0; n4 < TILE_N4; n4++) { + if (block_idx.data.x + m4 < block_extents.data.x && + block_idx.data.z + n4 < block_extents.data.z) { + imageStore( + t_packed_int8_output, block_idx.data, int8_tile.data[m4][n4]); + } + } + } +#endif +} + +#endif // CONV2D_INT8_OUTPUT_TILE_STORE diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_q8ta_q8csw_q8to_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_q8ta_q8csw_q8to_tiled.glsl new file mode 100644 index 00000000000..16c12b3ee5a --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_q8ta_q8csw_q8to_tiled.glsl @@ -0,0 +1,144 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} +#define VEC4_T ${texel_load_type(DTYPE, "buffer")} +#define T ${texel_load_component_type(DTYPE, "buffer")} + +$if IO_STORAGE == "buffer": + #define PACKED_INT8_OUTPUT_BUFFER + #define PACKED_INT8_INPUT_BUFFER +$if WEIGHT_STORAGE == "buffer": + #define WEIGHT_BUFFER + +// corresponds to input/output width dim +#define TILE_M4 1 +// corresponds to input channels dim +#define TILE_K4 1 +// corresponds to output channels dim +#define TILE_N4 2 + +#define TILE_M 4 +#define TILE_K 4 +#define TILE_N 8 + +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +#include "conv2d_common.glslh" + +${layout_declare_tensor(B, "w", "t_packed_int8_output", "int", IO_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", IO_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_packed_int8_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False)} + +${layout_declare_ubo(B, "ivec4", "output_sizes")} +${layout_declare_ubo(B, "ivec4", "input_sizes")} +${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")} + +layout(push_constant) uniform restrict Block { + float input_scale; + int input_zp; + float output_inv_scale; + int output_zp; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +${layout_declare_spec_const(C, "int", "apply_bias", "1")} + +#include "conv2d_int8_input_tile_load.glslh" +#include "linear_int8_weight_tile_load.glslh" +#include "linear_fp_output_tile_int8_int8_compute.glslh" +#include 
"linear_int_weight_sums_load.glslh" +#include "linear_fp_weight_scales_load.glslh" +#include "linear_fp_bias_load.glslh" +#include "linear_int8_output_tile_compute.glslh" +#include "conv2d_int8_output_tile_store.glslh" + +void main() { + Conv2dBlockIndex output_block_idx; + output_block_idx.data.z = int(gl_GlobalInvocationID.x) * TILE_N4; + output_block_idx.data.x = int(gl_GlobalInvocationID.y) * TILE_M4; + output_block_idx.data.y = int(gl_GlobalInvocationID.z); + + Conv2dBlockExtents output_block_extents = make_block_extents(output_sizes); + if (block_idx_out_of_bounds(output_block_idx, output_block_extents)) { + return; + } + + Conv2dBlockExtents input_block_extents = make_block_extents(input_sizes); + + Int32Accum out_accum; + initialize(out_accum); + + Int8InputTile int8_input_tile; + Int8WeightTile int8_weight_tile; + + Int8InputTileIndex input_idx = make_initial_int8_input_tile_index( + output_block_idx, input_block_extents); + + for (int k4 = 0; k4 < conv2d_params.K4_per_group; k4++) { + load_packed_int8_input_tile(int8_input_tile, input_idx); + + load_int8_weight_tile( + int8_weight_tile, + output_block_idx.data.z, + k4, + output_block_extents.data.z); + + int_accumulate_with_int8_weight( + out_accum, int8_input_tile, int8_weight_tile); + + increment_k4(input_idx); + } + + FPPerOutChannelParams weight_scales_tile; + load_weight_scales_tile(weight_scales_tile, output_block_idx.data.z); + + IntPerOutChannelParams weight_sums_tile; + load_weight_sums_tile(weight_sums_tile, output_block_idx.data.z); + + Int8OutTile int8_out_tile; + initialize(int8_out_tile); + + if (apply_bias > 0) { + FPPerOutChannelParams bias_tile; + load_bias_tile(bias_tile, output_block_idx.data.z); + + compute_int8_out_tile_with_int32_accum( + int8_out_tile, + out_accum, + input_scale, + input_zp, + output_inv_scale, + output_zp, + weight_sums_tile, + weight_scales_tile, + bias_tile); + } + else { + compute_int8_out_tile_with_int32_accum( + int8_out_tile, + out_accum, + input_scale, + 
input_zp, + output_inv_scale, + output_zp, + weight_sums_tile, + weight_scales_tile); + } + + store_packed_int8_output_tile( + int8_out_tile, output_block_idx, output_block_extents); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_q8ta_q8csw_q8to_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_q8ta_q8csw_q8to_tiled.yaml new file mode 100644 index 00000000000..23803dc6da1 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_q8ta_q8csw_q8to_tiled.yaml @@ -0,0 +1,20 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +conv2d_pw_q8ta_q8csw_q8to_tiled: + parameter_names_with_default_values: + DTYPE: float + IO_STORAGE: texture3d + WEIGHT_STORAGE: texture2d + generate_variant_forall: + combination: + parameter_names: [IO_STORAGE, WEIGHT_STORAGE] + combos: + - parameter_values: [buffer, texture2d] + DTYPE: + - VALUE: float + shader_variants: + - NAME: conv2d_pw_q8ta_q8csw_q8to_tiled diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.glsl index 9f84afeb1a1..ef50a1aca9f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.glsl @@ -12,10 +12,12 @@ #define PRECISION ${PRECISION} -#define VEC4_T ${texel_type(DTYPE)} +$if DTYPE == "half": + #extension GL_EXT_shader_explicit_arithmetic_types_float16 : require + #define VEC4_T f16vec4 +$else: + #define VEC4_T ${texel_type(DTYPE)} -#define TILE_SIZE_X uint16_t(${TILE_SIZE_X}) -#define TILE_SIZE_Y uint16_t(${TILE_SIZE_Y}) #define op(X, A, B) ${OPERATOR} @@ -50,119 +52,90 @@ ${layout_declare_spec_const(C, "int", "ngroups", "1")} * size is only 1x1, making it easier to re-use loaded texels from t_kernel. 
*/ void main() { - const int out_limits_scaled[2] = - {(out_limits.x + (TILE_SIZE_X - 1)) / TILE_SIZE_X, - (out_limits.y + (TILE_SIZE_Y - 1)) / TILE_SIZE_Y}; - const uint16_t div_by_x = uint16_t(gl_GlobalInvocationID.x / out_limits_scaled[0]); - const uint16_t out_pos_xy[2] = {uint16_t(gl_GlobalInvocationID.x % out_limits_scaled[0]), div_by_x}; - const int out_pos_z = int(gl_GlobalInvocationID.y); + int inputAndOutputWidth = out_limits.x; + int inputAndOutputHeight = out_limits.y; + int outputChannel = out_limits.z*4; - // If the top left position is out of bounds, then this invocation will have - // no work to do. - if (out_pos_xy[1] >= out_limits_scaled[1] || out_pos_z >= out_limits.z) { + // Divided by 4 because the input channels are packed + int inputChannel = in_group_size/4; + + int threadHW = int(gl_GlobalInvocationID.x); + int threadOutChannel = int(gl_GlobalInvocationID.y); + + int xIdx = threadHW % inputAndOutputWidth; + int yIdx = threadHW / inputAndOutputWidth; + + if (threadHW >= inputAndOutputWidth * inputAndOutputHeight && threadOutChannel >= outputChannel) { return; } - // Output position for TILE_SIZE = 2 - // +--------+--------+ - // | pos[0] | pos[1] | - // +--------+--------+ - // | pos[2] | pos[3] | - // +--------+--------+ - uint16_t pos[TILE_SIZE_X * TILE_SIZE_Y * 2]; - for (uint16_t y = uint16_t(0), i = uint16_t(0); y < TILE_SIZE_Y; ++y) { - for (uint16_t x = uint16_t(0); x < TILE_SIZE_X; ++x) { - pos[i * 2] = out_pos_xy[0] * TILE_SIZE_X + x; - pos[i * 2 + 1] = out_pos_xy[1] * TILE_SIZE_Y + y; - i++; - } - } + VEC4_T outputTexel = VEC4_T(texelFetch(t_bias, ivec2(threadOutChannel, 0), 0)); - // Final output array where each element is a tensor value. - // Tuple of consecutive 4 elements represents a single output texel. 
- float sum[TILE_SIZE_X * TILE_SIZE_Y * 4]; + VEC4_T inputVec; + VEC4_T weight1OutputChannelPacked; + VEC4_T weight2OutputChannelPacked; + VEC4_T weight3OutputChannelPacked; + VEC4_T weight4OutputChannelPacked; - // Initialize the output array with the bias value - for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y * 4; i++) { - sum[i] = 0; - } + // By unrolling the loop in sets of 4, this significantly reduces the number of branching instructions + // and enables the compiler to rearrange instructions for more efficient memory retrieval and compute + for (int inputC = 0; inputC < inputChannel; inputC += 1) { - int z4 = 0; - // Since the kernel is 1x1, we only have to loop over the depth dimension. - for (int z = 0; z < in_group_size; z += 4, ++z4) { - // During prepacking, the weight tensor has been permuted so that the - // channel (IC) dim is along the x-axis, and the batch (OC) dim is along - // the z-axis. - float kernel_values[4 * 4]; // 4 channels, 4 elements per channel - - // Load kernel values from texels to array - [[unroll]] for (int i = 0; i < 4; ++i) { - const vec4 k_tex = texelFetch(t_kernel, ivec2(z + i, out_pos_z), 0); - kernel_values[i * 4 + 0] = k_tex.x; - kernel_values[i * 4 + 1] = k_tex.y; - kernel_values[i * 4 + 2] = k_tex.z; - kernel_values[i * 4 + 3] = k_tex.w; - } - - for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { - const vec4 in_tex = texelFetch(t_in, ivec3(pos[i * 2], pos[i * 2 + 1], z4), 0); - // Load the input texel into an array - float tex_values[4]; - tex_values[0] = in_tex.x; - tex_values[1] = in_tex.y; - tex_values[2] = in_tex.z; - tex_values[3] = in_tex.w; - - // For 2x2 tile size algorithm works as follows. 
- // To explain the calculations below, the contents of one in_tex and the - // group of 4 texels loaded from t_kernel are shown: - // - // in_tex t_kernel - // -x-> ---x---> - // +---+ +----+----+----+----+ - // ^ | w | ^ | D0 | D1 | D2 | D3 | - // | +---+ | +----+----+----+----+ - // | | z | | | C0 | C1 | C2 | C3 | - // z +---+ z +----+----+----+----+ - // | | y | | | B0 | B2 | B2 | B3 | - // | +---+ | +----+----+----+----+ - // | x | | A0 | A1 | A2 | A3 | - // +---+ +----+----+----+----+ - // - // In the t_kernel graphic, cells sharing the same letter are from - // the same batch/output channel index, and the number denotes a unique - // channel index. To calculate the output texel, the following - // calculation is performed: - // - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // | x | | D0 | | y | | D1 | | z | | D2 | | w | | D3 | - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // | x | | C0 | | y | | C1 | | z | | C2 | | w | | C3 | - // +---+X+----+ + +---+X+----+ + +---+X+----+ + +---+X+----+ - // | x | | B0 | | y | | B1 | | z | | B2 | | w | | B3 | - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // | x | | A0 | | y | | A1 | | z | | A2 | | w | | A3 | - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // - // which is what is expressed in the following calculations. This is done - // for each output position. 
- for (int j = 0; j < 4; ++j) { - sum[i * 4 + j] = tex_values[0] * kernel_values[0 + j] + sum[i * 4 + j]; - sum[i * 4 + j] = tex_values[1] * kernel_values[4 + j] + sum[i * 4 + j]; - sum[i * 4 + j] = tex_values[2] * kernel_values[8 + j] + sum[i * 4 + j]; - sum[i * 4 + j] = tex_values[3] * kernel_values[12 + j] + sum[i * 4 + j]; - } - } - } + inputVec = VEC4_T(texelFetch(t_in, ivec3(xIdx, yIdx, inputC), 0)); + + weight1OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 0, threadOutChannel), 0)); + weight2OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 1, threadOutChannel), 0)); + weight3OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 2, threadOutChannel), 0)); + weight4OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 3, threadOutChannel), 0)); + + outputTexel[0] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[0], weight2OutputChannelPacked[0], weight3OutputChannelPacked[0], weight4OutputChannelPacked[0])); + outputTexel[1] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[1], weight2OutputChannelPacked[1], weight3OutputChannelPacked[1], weight4OutputChannelPacked[1])); + outputTexel[2] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[2], weight2OutputChannelPacked[2], weight3OutputChannelPacked[2], weight4OutputChannelPacked[2])); + outputTexel[3] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[3], weight2OutputChannelPacked[3], weight3OutputChannelPacked[3], weight4OutputChannelPacked[3])); + + inputC += 1; + + inputVec = VEC4_T(texelFetch(t_in, ivec3(xIdx, yIdx, inputC), 0)); - const vec4 bias = texelFetch(t_bias, ivec2(out_pos_z, 0), 0); + weight1OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 0, threadOutChannel), 0)); + weight2OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 1, threadOutChannel), 0)); + weight3OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 2, threadOutChannel), 0)); + weight4OutputChannelPacked = 
VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 3, threadOutChannel), 0)); - for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { - const ivec3 pos_l = ivec3(pos[i * 2], pos[i * 2 + 1], out_pos_z); - if (all(lessThan(pos_l.xy, out_limits.xy))) { - const vec4 out_sum = vec4(sum[i * 4], sum[i * 4 + 1], sum[i * 4 + 2], sum[i * 4 + 3]); - imageStore(t_out, pos_l, op(out_sum + bias, out_min, out_max)); - } + outputTexel[0] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[0], weight2OutputChannelPacked[0], weight3OutputChannelPacked[0], weight4OutputChannelPacked[0])); + outputTexel[1] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[1], weight2OutputChannelPacked[1], weight3OutputChannelPacked[1], weight4OutputChannelPacked[1])); + outputTexel[2] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[2], weight2OutputChannelPacked[2], weight3OutputChannelPacked[2], weight4OutputChannelPacked[2])); + outputTexel[3] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[3], weight2OutputChannelPacked[3], weight3OutputChannelPacked[3], weight4OutputChannelPacked[3])); + + inputC += 1; + + inputVec = VEC4_T(texelFetch(t_in, ivec3(xIdx, yIdx, inputC), 0)); + + weight1OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 0, threadOutChannel), 0)); + weight2OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 1, threadOutChannel), 0)); + weight3OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 2, threadOutChannel), 0)); + weight4OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 3, threadOutChannel), 0)); + + outputTexel[0] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[0], weight2OutputChannelPacked[0], weight3OutputChannelPacked[0], weight4OutputChannelPacked[0])); + outputTexel[1] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[1], weight2OutputChannelPacked[1], weight3OutputChannelPacked[1], weight4OutputChannelPacked[1])); + outputTexel[2] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[2], 
weight2OutputChannelPacked[2], weight3OutputChannelPacked[2], weight4OutputChannelPacked[2])); + outputTexel[3] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[3], weight2OutputChannelPacked[3], weight3OutputChannelPacked[3], weight4OutputChannelPacked[3])); + + inputC += 1; + + inputVec = VEC4_T(texelFetch(t_in, ivec3(xIdx, yIdx, inputC), 0)); + + weight1OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 0, threadOutChannel), 0)); + weight2OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 1, threadOutChannel), 0)); + weight3OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 2, threadOutChannel), 0)); + weight4OutputChannelPacked = VEC4_T(texelFetch(t_kernel, ivec2(inputC * 4 + 3, threadOutChannel), 0)); + + outputTexel[0] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[0], weight2OutputChannelPacked[0], weight3OutputChannelPacked[0], weight4OutputChannelPacked[0])); + outputTexel[1] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[1], weight2OutputChannelPacked[1], weight3OutputChannelPacked[1], weight4OutputChannelPacked[1])); + outputTexel[2] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[2], weight2OutputChannelPacked[2], weight3OutputChannelPacked[2], weight4OutputChannelPacked[2])); + outputTexel[3] += dot(inputVec, VEC4_T(weight1OutputChannelPacked[3], weight2OutputChannelPacked[3], weight3OutputChannelPacked[3], weight4OutputChannelPacked[3])); } + + imageStore(t_out, ivec3(xIdx, yIdx, threadOutChannel), op(vec4(outputTexel), out_min, out_max)); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.yaml index ebfee11c405..bab3c715540 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.yaml @@ -9,8 +9,6 @@ conv2d_pw_s1p0: OPERATOR: X NDIM: 3 DTYPE: float - TILE_SIZE_X: 1 - TILE_SIZE_Y: 4 generate_variant_forall: DTYPE: - VALUE: half diff 
--git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8_utils.glslh b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8_utils.glslh new file mode 100644 index 00000000000..279f4f17f13 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8_utils.glslh @@ -0,0 +1,151 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifndef CONV2D_Q8_UTILS_GLSLH +#define CONV2D_Q8_UTILS_GLSLH + +#extension GL_EXT_control_flow_attributes : require +#extension GL_EXT_integer_dot_product : require + +#include "linear_int_accumulator.glslh" + +struct Int8InputWindow1D { + int[MAX_WINDOW_WIDTH] data; + int len; +}; + +Int8InputWindow1D initial_input_window() { + Int8InputWindow1D input_window; + for (int i = 0; i < MAX_WINDOW_WIDTH; ++i) { + input_window.data[i] = 0; + } + input_window.len = 0; + return input_window; +} + +bool in_bounds( + const int block_w, + const int block_h, + const int block_c4, + const Conv2dBlockExtents block_extents) { + ivec3 idx = ivec3(block_w, block_h, block_c4); + if (any(lessThan(idx, ivec3(0)))) { + return false; + } + if (any(greaterThanEqual(idx, block_extents.data))) { + return false; + } + + return true; +} + +Int8InputWindow1D load_input_window( + const int w_start, + const int w_end, + const int h, + const int c4, + const Conv2dBlockExtents block_extents, + const ivec4 input_zps) { + Int8InputWindow1D input_window = initial_input_window(); + + const int block_w_start = div_4(w_start); + const int block_w_end = div_4(w_end); + + int window_i = 0; + for (int block_w = block_w_start; block_w <= block_w_end; ++block_w) { + ivec4 input_block = input_zps; + + if (in_bounds(block_w, h, c4, block_extents)) { +#ifdef PACKED_INT8_INPUT_BUFFER + const int buffer_idx = + h * block_extents.data_xz + block_w * block_extents.data.z + c4; + input_block = 
t_packed_int8_input[buffer_idx]; +#else + input_block = texelFetch(t_packed_int8_input, ivec3(block_w, h, c4), 0); +#endif + } + + const int loaded_w_start = mul_4(block_w); + for (int row = 0; row < 4; ++row) { + if (loaded_w_start + row >= w_start && loaded_w_start + row <= w_end) { + input_window.data[window_i++] = input_block[row]; + } + } + } + input_window.len = window_i; + return input_window; +} + +ivec4 load_weight_block( + const int ic4, + const int kx, + const int ky, + const int oc4, + const int IC4, + const int Kw, + const int Kh, + const int OC4) { +#ifdef PACKED_INT8_WEIGHTS_BUFFER + const int block_x = oc4 * Kw + kx; + const int block_y = ky * IC4 + ic4; + return t_packed_int8_weight[block_y * (Kw * OC4) + block_x]; +#else + return texelFetch( + t_packed_int8_weight, ivec2(oc4 * Kw + kx, ky * IC4 + ic4), 0); +#endif +} + +void perform_conv1d( + inout Int32Accum accum, + const Int8InputWindow1D input_window, + const ivec4 weight_block, + const int kx) { + [[unroll]] for (int out_w = 0; out_w < 4; ++out_w) { + const int window_i = out_w * conv2d_params.stride.x + kx; + [[unroll]] for (int out_c = 0; out_c < 4; ++out_c) { + accum.data[out_w][0][out_c] = dotPacked4x8AccSatEXT( + input_window.data[window_i], + weight_block[out_c], + accum.data[out_w][0][out_c]); + } + } +} + +#ifdef DEBUG_MODE + +void printInt8InputWindow1D(const Int8InputWindow1D input_window) { + debugPrintfEXT("Int8InputWindow1D contents (len = %d): \\n", input_window.len); + for (int i = 0; i < min(input_window.len, MAX_WINDOW_WIDTH); ++i) { + ivec4 unpacked = unpack_int8x4(input_window.data[i]); + debugPrintfEXT( + " [%d]: (%d, %d, %d, %d) \\n", + i, + unpacked.x, + unpacked.y, + unpacked.z, + unpacked.w); + } +} + +void printWeightBlock(const ivec4 weight_block) { + debugPrintfEXT("WeightBlock contents: \\n"); + for (int i = 0; i < 4; ++i) { + ivec4 unpacked = unpack_int8x4(weight_block[i]); + debugPrintfEXT( + " [%d]: (%d, %d, %d, %d) \\n", + i, + unpacked.x, + unpacked.y, + 
unpacked.z, + unpacked.w); + } +} + +#endif // DEBUG_MODE + +#endif // CONV2D_Q8_UTILS_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_q8to.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_q8to.glsl new file mode 100644 index 00000000000..5839b13aeaa --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_q8to.glsl @@ -0,0 +1,173 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} +#define VEC4_T ${texel_load_type(DTYPE, "buffer")} +#define T ${texel_load_component_type(DTYPE, "buffer")} + +$if IO_STORAGE == "buffer": + #define PACKED_INT8_OUTPUT_BUFFER + #define PACKED_INT8_INPUT_BUFFER +$if WEIGHT_STORAGE == "buffer": + #define WEIGHT_BUFFER + +#define MAX_WINDOW_WIDTH 16 + +// corresponds to input/output width dim +#define TILE_M4 1 +// corresponds to input channels dim +#define TILE_K4 1 +// corresponds to output channels dim +#define TILE_N4 1 + +#define TILE_M 4 +#define TILE_K 4 +#define TILE_N 4 + +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +#include "conv2d_common.glslh" + +${layout_declare_tensor(B, "w", "t_packed_int8_output", "int", IO_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", IO_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_packed_int8_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False)} + +${layout_declare_ubo(B, "ivec4", "output_sizes")} +${layout_declare_ubo(B, "ivec4", "input_sizes")} 
+${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")} + +layout(push_constant) uniform restrict Block { + float input_scale; + int input_zp; + float output_inv_scale; + int output_zp; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +${layout_declare_spec_const(C, "int", "apply_bias", "1")} + +#include "im2col_packed_int8_utils.glslh" +#include "conv2d_int8_input_tile_load.glslh" +#include "linear_int8_weight_tile_load.glslh" +#include "linear_fp_output_tile_int8_int8_compute.glslh" +#include "linear_int_weight_sums_load.glslh" +#include "linear_fp_weight_scales_load.glslh" +#include "linear_fp_bias_load.glslh" +#include "linear_int8_output_tile_compute.glslh" +#include "conv2d_int8_output_tile_store.glslh" + +#include "conv2d_q8_utils.glslh" + +void main() { + Conv2dBlockIndex out_block_idx; + out_block_idx.data.z = int(gl_GlobalInvocationID.x) * TILE_N4; + out_block_idx.data.x = int(gl_GlobalInvocationID.y) * TILE_M4; + out_block_idx.data.y = int(gl_GlobalInvocationID.z); + + Conv2dBlockExtents out_block_extents = make_block_extents(output_sizes); + if (block_idx_out_of_bounds(out_block_idx, out_block_extents)) { + return; + } + + const int out_w = mul_4(out_block_idx.data.x); + const int w_start = + (out_w * conv2d_params.stride.x) - conv2d_params.padding.x; + const int w_end = ((out_w + 3) * conv2d_params.stride.x) - + conv2d_params.padding.x + + (conv2d_params.kernel_size.x - 1) * conv2d_params.dilation.x; + + Conv2dBlockExtents in_block_extents = make_block_extents(input_sizes); + + const ivec4 input_zps = ivec4(pack_into_int32(ivec4(input_zp))); + const vec4 weight_scales = vec4(t_weight_scales[out_block_idx.data.z]); + + Int32Accum out_accum; + initialize(out_accum); + + const int IC4_per_group = div_up_4(conv2d_params.in_channels_per_group); + + const int n = mul_4(out_block_idx.data.z); + const int group_idx = n / conv2d_params.out_channels_per_group; + const int group_ic4_offset = group_idx * IC4_per_group; + + for 
(int ky = 0; ky < conv2d_params.kernel_size.y; ky++) { + const int h = out_block_idx.data.y * conv2d_params.stride.y - + conv2d_params.padding.y + ky * conv2d_params.dilation.y; + + for (int ic4 = 0; ic4 < IC4_per_group; ic4++) { + Int8InputWindow1D int8_input_window = load_input_window( + w_start, + w_end, + h, + group_ic4_offset + ic4, + in_block_extents, + input_zps); + + for (int kx = 0; kx < conv2d_params.kernel_size.x; kx++) { + const ivec4 weight_block = load_weight_block( + ic4, + kx, + ky, + out_block_idx.data.z, + IC4_per_group, + conv2d_params.kernel_size.x, + conv2d_params.kernel_size.y, + out_block_extents.data.z); + + perform_conv1d(out_accum, int8_input_window, weight_block, kx); + } + } + } + + FPPerOutChannelParams weight_scales_tile; + load_weight_scales_tile(weight_scales_tile, out_block_idx.data.z); + + IntPerOutChannelParams weight_sums_tile; + load_weight_sums_tile(weight_sums_tile, out_block_idx.data.z); + + Int8OutTile int8_out_tile; + initialize(int8_out_tile); + + if (apply_bias > 0) { + FPPerOutChannelParams bias_tile; + load_bias_tile(bias_tile, out_block_idx.data.z); + + compute_int8_out_tile_with_int32_accum( + int8_out_tile, + out_accum, + input_scale, + input_zp, + output_inv_scale, + output_zp, + weight_sums_tile, + weight_scales_tile, + bias_tile); + } + else { + compute_int8_out_tile_with_int32_accum( + int8_out_tile, + out_accum, + input_scale, + input_zp, + output_inv_scale, + output_zp, + weight_sums_tile, + weight_scales_tile); + } + + store_packed_int8_output_tile( + int8_out_tile, out_block_idx, out_block_extents); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_q8to.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_q8to.yaml new file mode 100644 index 00000000000..5da9cc14584 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_q8to.yaml @@ -0,0 +1,20 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +conv2d_q8ta_q8csw_q8to: + parameter_names_with_default_values: + DTYPE: float + IO_STORAGE: texture3d + WEIGHT_STORAGE: texture2d + generate_variant_forall: + combination: + parameter_names: [IO_STORAGE, WEIGHT_STORAGE] + combos: + - parameter_values: [buffer, texture2d] + DTYPE: + - VALUE: float + shader_variants: + - NAME: conv2d_q8ta_q8csw_q8to diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_q8to_linear_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_q8to_linear_tiled.glsl new file mode 100644 index 00000000000..b44e37766fc --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_q8to_linear_tiled.glsl @@ -0,0 +1,149 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} +#define VEC4_T ${texel_load_type(DTYPE, "buffer")} +#define T ${texel_load_component_type(DTYPE, "buffer")} + +$if IO_STORAGE == "buffer": + #define PACKED_INT8_OUTPUT_BUFFER + #define PACKED_INT8_INPUT_BUFFER +$if WEIGHT_STORAGE == "buffer": + #define WEIGHT_BUFFER + +// corresponds to input/output width dim +#define TILE_M4 1 +// corresponds to input channels dim +#define TILE_K4 1 +// corresponds to output channels dim +#define TILE_N4 2 + +#define TILE_M 4 +#define TILE_K 4 +#define TILE_N 8 + +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +#include "conv2d_common.glslh" + +${layout_declare_tensor(B, "w", "t_packed_int8_output", "int", IO_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", IO_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_packed_int8_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False)} + +${layout_declare_ubo(B, "ivec4", "output_sizes")} +${layout_declare_ubo(B, "ivec4", "im2col_sizes")} +${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")} + +layout(push_constant) uniform restrict Block { + float input_scale; + int input_zp; + float output_inv_scale; + int output_zp; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +${layout_declare_spec_const(C, "int", "apply_bias", "1")} + +#include "conv2d_int8_input_tile_load.glslh" +#include "linear_int8_weight_tile_load.glslh" +#include "linear_fp_output_tile_int8_int8_compute.glslh" +#include "linear_int_weight_sums_load.glslh" +#include "linear_fp_weight_scales_load.glslh" +#include "linear_fp_bias_load.glslh" +#include 
"linear_int8_output_tile_compute.glslh" +#include "conv2d_int8_output_tile_store.glslh" + +void main() { + Conv2dBlockIndex output_block_idx; + output_block_idx.data.z = int(gl_GlobalInvocationID.x) * TILE_N4; + output_block_idx.data.x = int(gl_GlobalInvocationID.y) * TILE_M4; + output_block_idx.data.y = int(gl_GlobalInvocationID.z); + + Conv2dBlockExtents output_block_extents = make_block_extents(output_sizes); + if (block_idx_out_of_bounds(output_block_idx, output_block_extents)) { + return; + } + + const int n = mul_4(output_block_idx.data.z); + + const int group_idx = n / conv2d_params.out_channels_per_group; + const int group_k4_offset = group_idx * conv2d_params.K4_per_group; + + Conv2dBlockExtents input_block_extents = make_block_extents(im2col_sizes); + + Int32Accum out_accum; + initialize(out_accum); + + Int8InputTile int8_input_tile; + Int8WeightTile int8_weight_tile; + + Int8InputTileIndex input_idx = make_initial_int8_input_tile_index( + output_block_idx, input_block_extents, group_k4_offset); + + for (int k4 = 0; k4 < conv2d_params.K4_per_group; k4++) { + load_packed_int8_input_tile(int8_input_tile, input_idx); + + load_int8_weight_tile( + int8_weight_tile, + output_block_idx.data.z, + k4, + output_block_extents.data.z); + + int_accumulate_with_int8_weight( + out_accum, int8_input_tile, int8_weight_tile); + + increment_k4(input_idx); + } + + FPPerOutChannelParams weight_scales_tile; + load_weight_scales_tile(weight_scales_tile, output_block_idx.data.z); + + IntPerOutChannelParams weight_sums_tile; + load_weight_sums_tile(weight_sums_tile, output_block_idx.data.z); + + Int8OutTile int8_out_tile; + initialize(int8_out_tile); + + if (apply_bias > 0) { + FPPerOutChannelParams bias_tile; + load_bias_tile(bias_tile, output_block_idx.data.z); + + compute_int8_out_tile_with_int32_accum( + int8_out_tile, + out_accum, + input_scale, + input_zp, + output_inv_scale, + output_zp, + weight_sums_tile, + weight_scales_tile, + bias_tile); + } + else { + 
compute_int8_out_tile_with_int32_accum( + int8_out_tile, + out_accum, + input_scale, + input_zp, + output_inv_scale, + output_zp, + weight_sums_tile, + weight_scales_tile); + } + + store_packed_int8_output_tile( + int8_out_tile, output_block_idx, output_block_extents); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_q8to_linear_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_q8to_linear_tiled.yaml new file mode 100644 index 00000000000..fa92481f5ef --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_q8to_linear_tiled.yaml @@ -0,0 +1,20 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +conv2d_q8ta_q8csw_q8to_linear_tiled: + parameter_names_with_default_values: + DTYPE: float + IO_STORAGE: texture3d + WEIGHT_STORAGE: texture2d + generate_variant_forall: + combination: + parameter_names: [IO_STORAGE, WEIGHT_STORAGE] + combos: + - parameter_values: [buffer, texture2d] + DTYPE: + - VALUE: float + shader_variants: + - NAME: conv2d_q8ta_q8csw_q8to_linear_tiled diff --git a/backends/vulkan/runtime/graph/ops/glsl/full.yaml b/backends/vulkan/runtime/graph/ops/glsl/full.yaml index eff78a7938d..1a5b0cb235e 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/full.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/full.yaml @@ -14,5 +14,6 @@ full: DTYPE: - VALUE: half - VALUE: float + - VALUE: int32 shader_variants: - NAME: full diff --git a/backends/vulkan/runtime/graph/ops/glsl/im2col_packed_int8.glsl b/backends/vulkan/runtime/graph/ops/glsl/im2col_packed_int8.glsl new file mode 100644 index 00000000000..3ecaa597ecc --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/im2col_packed_int8.glsl @@ -0,0 +1,73 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +$if STORAGE == "buffer": + #define PACKED_INT8_OUTPUT_BUFFER + #define PACKED_INT8_INPUT_BUFFER + +#define TILE_M4 1 +#define TILE_N4 1 +#define TILE_K4 1 + +#define TILE_M 4 +#define TILE_N 4 +#define TILE_K 4 + +layout(std430) buffer; + +#include "conv2d_common.glslh" + +${layout_declare_tensor(B, "w", "t_packed_int8_output", "int", STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", STORAGE, is_scalar_array=False)} + +${layout_declare_ubo(B, "ivec4", "im2col_sizes")} +// Sizes of the output image +${layout_declare_ubo(B, "ivec4", "output_sizes")} +// Sizes of the input image +${layout_declare_ubo(B, "ivec4", "input_sizes")} + +${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")} + +layout(push_constant) uniform restrict Block { + float inv_scale; + int zp; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +#include "conv2d_int8_output_tile_store.glslh" +#include "im2col_packed_int8_utils.glslh" + +void main() { + const int out_buf_idx = int(gl_GlobalInvocationID.x); + Conv2dBlockExtents im2col_block_extents = make_block_extents(im2col_sizes); + + Conv2dBlockIndex im2col_block_idx = linear_idx_to_block_idx( + out_buf_idx, im2col_block_extents); + + if (block_idx_out_of_bounds(im2col_block_idx, im2col_block_extents)) { + return; + } + + Im2ColBlockLoadIndices load_ixs = im2col_block_idx_to_load_ixs( + im2col_block_idx); + + Conv2dBlockExtents input_block_extents = make_block_extents(input_sizes); + + const ivec4 input_zps = ivec4(pack_into_int32(ivec4(zp))); + Int8OutTile int8_im2col_tile; + int8_im2col_tile.data[0][0] = load_im2col_block( + load_ixs, input_block_extents, zp, input_zps); + + store_packed_int8_output_tile( + int8_im2col_tile, im2col_block_idx, im2col_block_extents); +} 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/im2col_packed_int8.yaml b/backends/vulkan/runtime/graph/ops/glsl/im2col_packed_int8.yaml new file mode 100644 index 00000000000..1c14f1fdc5a --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/im2col_packed_int8.yaml @@ -0,0 +1,14 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +im2col_packed_int8: + parameter_names_with_default_values: + STORAGE: buffer + generate_variant_forall: + STORAGE: + - VALUE: buffer + shader_variants: + - NAME: im2col_packed_int8 diff --git a/backends/vulkan/runtime/graph/ops/glsl/im2col_packed_int8_utils.glslh b/backends/vulkan/runtime/graph/ops/glsl/im2col_packed_int8_utils.glslh new file mode 100644 index 00000000000..2b1870c493d --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/im2col_packed_int8_utils.glslh @@ -0,0 +1,287 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#ifndef IM2COL_PACKED_INT8_GLSLH +#define IM2COL_PACKED_INT8_GLSLH + +#include "common.glslh" + +struct Conv2dBlockElementIndex { + int x4; + int y; + int z4; + + int row; + int col; +}; + +struct Im2ColBlockLoadIndices { + bool block_aligned; + bool cols_aligned; + bool rows_contiguous; + + int im2col_w_start; + int im2col_h; + int k_in_group_start; + int group_idx; + + Conv2dBlockElementIndex block_idx_start; +}; + +Conv2dBlockElementIndex tidx_to_block_elem_idx(const TensorIndex4D tidx) { + Conv2dBlockElementIndex block_idx; + block_idx.x4 = div_4(tidx.data.x); + block_idx.row = mod_4(tidx.data.x); + + block_idx.y = tidx.data.y; + + block_idx.z4 = div_4(tidx.data.z); + block_idx.col = mod_4(tidx.data.z); + + return block_idx; +} + +TensorIndex4D get_input_tensor_tidx( + const int w, + const int h, + const int k_in_group, + const int group_idx) { + TensorIndex4D tidx; + tidx.data.w = 0; + + const int c_in_group = k_in_group % conv2d_params.in_channels_per_group; + const int row = k_in_group / conv2d_params.in_channels_per_group; + const int kernel_x = row % conv2d_params.kernel_size.x; + const int kernel_y = row / conv2d_params.kernel_size.x; + + tidx.data.z = group_idx * conv2d_params.in_channels_per_group + c_in_group; + + tidx.data.x = (w * conv2d_params.stride.x) - conv2d_params.padding.x + + (kernel_x * conv2d_params.dilation.x); + tidx.data.y = (h * conv2d_params.stride.y) - conv2d_params.padding.y + + (kernel_y * conv2d_params.dilation.y); + + return tidx; +} + +Im2ColBlockLoadIndices im2col_block_idx_to_load_ixs( + Conv2dBlockIndex im2col_block_idx) { + const int im2col_w = mul_4(im2col_block_idx.data.x); + const int im2col_h = im2col_block_idx.data.y; + const int im2col_k = mul_4(im2col_block_idx.data.z); + + const int group_idx = im2col_k / conv2d_params.K_per_group; + const int k_in_group = im2col_k % conv2d_params.K_per_group; + + TensorIndex4D input_tidx = + get_input_tensor_tidx(im2col_w, im2col_h, k_in_group, group_idx); + + bool 
cols_aligned = (mod_4(input_tidx.data.z) == 0) && + (input_tidx.data.z + 3 < conv2d_params.in_channels_per_group); + + bool rows_aligned = mod_4(input_tidx.data.x) == 0; + bool rows_contiguous = conv2d_params.stride.x == 1; + + Im2ColBlockLoadIndices load_ixs; + load_ixs.block_aligned = cols_aligned && rows_aligned && rows_contiguous; + load_ixs.cols_aligned = cols_aligned; + load_ixs.rows_contiguous = rows_contiguous; + + load_ixs.im2col_w_start = im2col_w; + load_ixs.im2col_h = im2col_h; + load_ixs.k_in_group_start = k_in_group; + load_ixs.group_idx = group_idx; + + load_ixs.block_idx_start = tidx_to_block_elem_idx(input_tidx); + + return load_ixs; +} + +bool is_block_elem_idx_in_bounds( + const Conv2dBlockElementIndex idx, + const Conv2dBlockExtents block_extents) { + const ivec3 block_idx = ivec3(idx.x4, idx.y, idx.z4); + if (any(lessThan(block_idx, ivec3(0))) || + any(greaterThanEqual(block_idx, block_extents.data))) { + return false; + } + return true; +} + +int load_packed_int8_input_element( + const Conv2dBlockElementIndex idx, + const Conv2dBlockExtents block_extents, + const int input_zp) { + // bounds checking + if (!is_block_elem_idx_in_bounds(idx, block_extents)) { + return input_zp; + } +#ifdef PACKED_INT8_INPUT_BUFFER + const int buf_idx = + idx.y * block_extents.data_xz + idx.x4 * block_extents.data.z + idx.z4; + const ivec4 tile = t_packed_int8_input[buf_idx]; +#else + const ivec4 tile = + texelFetch(t_packed_int8_input, ivec3(idx.x4, idx.y, idx.z4), 0); +#endif + return extract_8bit_from_packed_int_le(tile[idx.row], idx.col); +} + +Conv2dBlockElementIndex get_packed_int8_input_element_idx( + const int im2col_w, + const int im2col_h, + const int k_in_group, + const int group_idx) { + TensorIndex4D input_tidx = + get_input_tensor_tidx(im2col_w, im2col_h, k_in_group, group_idx); + + return tidx_to_block_elem_idx(input_tidx); +} + +ivec4 load_im2col_block_aligned( + const Im2ColBlockLoadIndices load_ixs, + const Conv2dBlockExtents block_extents) { 
+#ifdef PACKED_INT8_INPUT_BUFFER + const int buf_idx = load_ixs.block_idx_start.y * block_extents.data_xz + + load_ixs.block_idx_start.x4 * block_extents.data.z + + load_ixs.block_idx_start.z4; + return t_packed_int8_input[buf_idx]; +#else + return texelFetch( + t_packed_int8_input, + ivec3( + load_ixs.block_idx_start.x4, + load_ixs.block_idx_start.y, + load_ixs.block_idx_start.z4), + 0); +#endif +} + +ivec4 load_im2col_block_c_aligned_w_contiguous( + const Im2ColBlockLoadIndices load_ixs, + const Conv2dBlockExtents block_extents, + const ivec4 input_zps) { + ivec4 im2col_block; + Conv2dBlockElementIndex block_elem_idx = load_ixs.block_idx_start; + +#ifdef PACKED_INT8_INPUT_BUFFER + int buf_idx = load_ixs.block_idx_start.y * block_extents.data_xz + + load_ixs.block_idx_start.x4 * block_extents.data.z + + load_ixs.block_idx_start.z4; +#endif + + ivec4 in_block = input_zps; + if (is_block_elem_idx_in_bounds(block_elem_idx, block_extents)) { +#ifdef PACKED_INT8_INPUT_BUFFER + in_block = t_packed_int8_input[buf_idx]; +#else + in_block = texelFetch( + t_packed_int8_input, + ivec3(block_elem_idx.x4, block_elem_idx.y, block_elem_idx.z4), + 0); +#endif + } + + int current_row = 0; + int r_limit = min(4 - block_elem_idx.row, 4); + for (int r = 0; r < r_limit; r++) { + im2col_block[current_row++] = in_block[r + block_elem_idx.row]; + } + + in_block = input_zps; + block_elem_idx.x4++; +#ifdef PACKED_INT8_INPUT_BUFFER + buf_idx += block_extents.data.z; +#endif + + if (is_block_elem_idx_in_bounds(block_elem_idx, block_extents)) { +#ifdef PACKED_INT8_INPUT_BUFFER + in_block = t_packed_int8_input[buf_idx]; +#else + in_block = texelFetch( + t_packed_int8_input, + ivec3(block_elem_idx.x4, block_elem_idx.y, block_elem_idx.z4), + 0); +#endif + } + + for (int r = 0; current_row < 4; ++r) { + im2col_block[current_row++] = in_block[r]; + } + + return im2col_block; +} + +ivec4 load_im2col_block_no_alignment( + const Im2ColBlockLoadIndices load_ixs, + const Conv2dBlockExtents 
block_extents, + const int input_zp) { + ivec4 im2col_block; + + for (int r = 0; r < 4; r++) { + const int im2col_w = load_ixs.im2col_w_start + r; + ivec4 row_values; + for (int c = 0; c < 4; c++) { + const int k_in_group = load_ixs.k_in_group_start + c; + + if (k_in_group >= conv2d_params.logical_K_per_group) { + row_values[c] = input_zp; + continue; + } + + Conv2dBlockElementIndex block_idx = get_packed_int8_input_element_idx( + im2col_w, load_ixs.im2col_h, k_in_group, load_ixs.group_idx); + + row_values[c] = + load_packed_int8_input_element(block_idx, block_extents, input_zp); + } + + im2col_block[r] = pack_into_int32(row_values); + } + return im2col_block; +} + +ivec4 load_im2col_block( + const Im2ColBlockLoadIndices load_ixs, + const Conv2dBlockExtents block_extents, + const int input_zp, + const ivec4 input_zps) { + if (load_ixs.cols_aligned && load_ixs.rows_contiguous) { + return load_im2col_block_c_aligned_w_contiguous( + load_ixs, block_extents, input_zps); + } + return load_im2col_block_no_alignment(load_ixs, block_extents, input_zp); +} + +#ifdef DEBUG_MODE + +void printLoadIndices(const Im2ColBlockLoadIndices load_ixs) { + debugPrintfEXT("LoadIndices: \\n"); + + if (load_ixs.block_aligned) { + debugPrintfEXT(" block_aligned \\n"); + } + if (load_ixs.cols_aligned) { + debugPrintfEXT(" cols_aligned \\n"); + } + if (load_ixs.rows_contiguous) { + debugPrintfEXT(" rows_contiguous \\n"); + } + + debugPrintfEXT( + " block_idx_start: %d %d %d || %d %d \\n", + load_ixs.block_idx_start.x4, + load_ixs.block_idx_start.y, + load_ixs.block_idx_start.z4, + load_ixs.block_idx_start.row, + load_ixs.block_idx_start.col); +} + +#endif + +#endif // IM2COL_PACKED_INT8_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_common.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_common.glslh index da326b26e93..c95abdcb230 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_common.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_common.glslh @@ 
-16,19 +16,6 @@ #include "common.glslh" -int sign_extend_8bit(const int val) { - if ((val & 0x80) != 0) { - return val | (~0xFF); - } - return val; -} - -int extract_8bit_from_packed_int_le(const int packed, const int i) { - // account for little endian - int byte = sign_extend_8bit(packed >> (8 * i) & 0xFF); - return byte; -} - // Extract a 4-bit value from a packed int (little endian) // It is assumed that the 4-bit value is in the range [0, 15] int extract_4bit_from_packed_int_le(const int packed, const int col) { diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_int8_int8_compute.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_int8_int8_compute.glslh index ca25e406ac1..850dc7943c0 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_int8_int8_compute.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_int8_int8_compute.glslh @@ -75,7 +75,7 @@ void accumulate_out_tile_with_int_accum( input_zp_vec * weight_sums.data[n4] + accum.data[m][n4]; out_tile.data[m][n4] = fma(VEC4_T(accum_adjusted), - VEC4_T(input_q_scale * weight_scales.data[0]), + VEC4_T(input_q_scale * weight_scales.data[n4]), out_tile.data[m][n4]); } } diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_block.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_block.glslh index a6dbd7e78a2..8f19418cd19 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_block.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_block.glslh @@ -43,13 +43,6 @@ ivec4 quantize( return clamp(ivec4(quantized), -128, 127); } -int pack_into_int32(const ivec4 quant_vals) { - int packed = ((quant_vals[0] & 0xFF) << 0) | ((quant_vals[1] & 0xFF) << 8) | - ((quant_vals[2] & 0xFF) << 16) | ((quant_vals[3] & 0xFF) << 24); - - return packed; -} - void quantize_and_pack( out Int8InputBlock packed, const FPInputTile in_block, diff --git 
a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_output_tile.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_output_tile.glslh new file mode 100644 index 00000000000..14aa6558bfc --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_output_tile.glslh @@ -0,0 +1,67 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Macro Settings: + * - TILE_M + * - TILE_N4 + */ + +#ifndef LINEAR_INT8_OUTPUT_TILE_GLSLH +#define LINEAR_INT8_OUTPUT_TILE_GLSLH + +#extension GL_EXT_control_flow_attributes : require + +struct Int8OutTile { + ivec4 data[TILE_M4][TILE_N4]; +}; + +void initialize(out Int8OutTile tile) { + [[unroll]] for (int m4 = 0; m4 < TILE_M4; ++m4) { + [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { + tile.data[m4][n4] = ivec4(0); + } + } +} + +#ifdef DEBUG_MODE + +#include "linear_common.glslh" + +void printInt8OutTile(const Int8OutTile tile) { + debugPrintfEXT( + "Int8InputTile [TILE_M4=%d][TILE_N4=%d]:\\n", TILE_M4, TILE_N4); + + [[unroll]] for (int m4 = 0; m4 < TILE_M4; ++m4) { + [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { + debugPrintfEXT(" tile[%d][%d] (ivec4): ", m4, n4); + + // Each ivec4 contains 4 packed integers, each integer contains 4 8-bit + // values + [[unroll]] for (int vec_idx = 0; vec_idx < 4; ++vec_idx) { + int packed_int = tile.data[m4][n4][vec_idx]; + debugPrintfEXT("packed_int[%d]=%d -> [", vec_idx, packed_int); + + // Extract 4 8-bit values from this packed integer + [[unroll]] for (int byte_idx = 0; byte_idx < 4; ++byte_idx) { + int val = extract_8bit_from_packed_int_le(packed_int, byte_idx); + if (byte_idx < 3) { + debugPrintfEXT("%d, ", val); + } else { + debugPrintfEXT("%d] ", val); + } + } + } + debugPrintfEXT("\\n"); + } + } +} + +#endif // DEBUG_MODE + +#endif // LINEAR_INT8_OUTPUT_TILE_GLSLH diff --git 
a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_output_tile_compute.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_output_tile_compute.glslh new file mode 100644 index 00000000000..1251ca60b87 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_output_tile_compute.glslh @@ -0,0 +1,93 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Defines functions to compute a FPOutTile using int8 input and weight tiles. + * + * Settings: + * - TILE_M: The number of rows in the output tile. + * - TILE_N4: The number of (groups of 4) columns in the output tile. + */ + +#ifndef LINEAR_INT8_OUTPUT_TILE_INT8_INT8_COMPUTE_GLSLH +#define LINEAR_INT8_OUTPUT_TILE_INT8_INT8_COMPUTE_GLSLH + +#extension GL_EXT_control_flow_attributes : require +#extension GL_EXT_integer_dot_product : require + +#include "linear_fp_per_out_channel_params.glslh" +#include "linear_int8_output_tile.glslh" +#include "linear_int_accumulator.glslh" +#include "linear_int_per_out_channel_params.glslh" + +void compute_int8_out_tile_with_int32_accum( + out Int8OutTile out_tile, + const Int32Accum accum, + const float input_q_scale, + const int input_q_zp, + const float output_q_inv_scale, + const int output_q_zp, + const IntPerOutChannelParams weight_sums, + const FPPerOutChannelParams weight_scales) { + ivec4 input_zp_vec = ivec4(-input_q_zp); + ivec4 output_zp_vec = ivec4(-output_q_zp); + [[unroll]] for (int m4 = 0; m4 < TILE_M4; ++m4) { + [[unroll]] for (int m4i = 0; m4i < 4; ++m4i) { + [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { + const int m = mul_4(m4) + m4i; + // Compute floating point output values + ivec4 accum_adjusted = + input_zp_vec * weight_sums.data[n4] + accum.data[m][n4]; + vec4 float_out_texel = + vec4(accum_adjusted) * vec4(weight_scales.data[n4] * input_q_scale); + // 
Requantize to int8 + float_out_texel = + round(float_out_texel * output_q_inv_scale) + output_q_zp; + ivec4 quantized_out_texel = clamp(ivec4(float_out_texel), -128, 127); + + out_tile.data[m4][n4][m4i] = pack_into_int32(quantized_out_texel); + } + } + } +} + +void compute_int8_out_tile_with_int32_accum( + out Int8OutTile out_tile, + const Int32Accum accum, + const float input_q_scale, + const int input_q_zp, + const float output_q_inv_scale, + const int output_q_zp, + const IntPerOutChannelParams weight_sums, + const FPPerOutChannelParams weight_scales, + const FPPerOutChannelParams bias) { + ivec4 input_zp_vec = ivec4(-input_q_zp); + ivec4 output_zp_vec = ivec4(-output_q_zp); + [[unroll]] for (int m4 = 0; m4 < TILE_M4; ++m4) { + [[unroll]] for (int m4i = 0; m4i < 4; ++m4i) { + [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { + const int m = mul_4(m4) + m4i; + // Compute floating point output values + ivec4 accum_adjusted = + input_zp_vec * weight_sums.data[n4] + accum.data[m][n4]; + vec4 float_out_texel = + fma(vec4(accum_adjusted), + vec4(weight_scales.data[n4]) * input_q_scale, + vec4(bias.data[n4])); + // Requantize to int8 + float_out_texel = + round(float_out_texel * output_q_inv_scale) + output_q_zp; + ivec4 quantized_out_texel = clamp(ivec4(float_out_texel), -128, 127); + + out_tile.data[m4][n4][m4i] = pack_into_int32(quantized_out_texel); + } + } + } +} + +#endif // LINEAR_INT8_OUTPUT_TILE_INT8_INT8_COMPUTE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_tiled.glsl index 0ad91643219..878821d4189 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_tiled.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_tiled.glsl @@ -76,9 +76,6 @@ void main() { const int N4 = div_up_4(output_sizes.x); // number of texels in each row const int N8 = div_up_8(output_sizes.x); // number of texels in each row - bool should_print = (n8 == 0) && (m4 == 0); - 
should_print = false; - // VEC4_T out_texels[4][2]; FPOutTile out_tile; initialize(out_tile); diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_q8ta_q8csw_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_q8ta_q8csw_tiled.yaml index aa1de3077fc..989729f2d7f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_q8ta_q8csw_tiled.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_q8ta_q8csw_tiled.yaml @@ -11,7 +11,7 @@ linear_q8ta_q8csw_tiled: PACKED_INT8_INPUT_STORAGE: buffer WEIGHT_STORAGE: texture2d TILE_M4: 1 - TILE_N4: 1 + TILE_N4: 2 TILE_K4: 1 generate_variant_forall: DTYPE: diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_q8_conv2d_dw_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/pack_q8_conv2d_dw_weights.glsl new file mode 100644 index 00000000000..da4162b6e58 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/pack_q8_conv2d_dw_weights.glsl @@ -0,0 +1,72 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +${define_active_storage_type(STORAGE)} + +#extension GL_EXT_control_flow_attributes : require + +layout(std430) buffer; + +${layout_declare_tensor(B, "w", "t_packed_int8_weight", "int", STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_int8_weight", "int", "buffer")} + +layout(push_constant) uniform restrict Block { + ivec4 qmat2_sizes; + ivec3 orig_sizes; // [K_h, aligned_K_w, OC] +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +#include "common.glslh" + +void main() { + // The size of the source weight tensor is [K_h, aligned_K_w, OC] for depthwise conv. + // Each shader invocation processes a 4x4 block of weights for a group of output channels. 
+ const int oc4 = int(gl_GlobalInvocationID.x); + const int k4 = int(gl_GlobalInvocationID.y); + const int k = mul_4(k4); + + const int H = orig_sizes.x; + const int orig_W = orig_sizes.y; + const int W4 = div_up_4(orig_W); + const int OC = orig_sizes.z; + + const int h = k4 / W4; + const int w4 = k4 % W4; + const int w = mul_4(w4); + + // Determine the total number of blocks and check bounds + const int OC4 = div_up_4(OC); + const int K4 = H * W4; + + if (oc4 >= OC4 || k4 >= K4) { + return; + } + + ivec4 packed_block; + + int buf_idx = (h * orig_W + w) * OC4 + oc4; + int r_limit = min(4, orig_W - w); + [[unroll]] for (int r = 0; r < r_limit; r++) { + packed_block[r] = t_int8_weight[buf_idx]; + buf_idx += OC4; + } + [[unroll]] for (int r = r_limit; r < 4; r++) { + packed_block[r] = 0; + } + +#ifdef USING_BUFFER + t_packed_int8_weight[k4 * OC4 + oc4] = packed_block; +#else + imageStore(t_packed_int8_weight, ivec2(oc4, k4), packed_block); +#endif +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_q8_conv2d_dw_weights.yaml b/backends/vulkan/runtime/graph/ops/glsl/pack_q8_conv2d_dw_weights.yaml new file mode 100644 index 00000000000..9cfa3108ff0 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/pack_q8_conv2d_dw_weights.yaml @@ -0,0 +1,15 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +pack_q8_conv2d_dw_weights: + parameter_names_with_default_values: + STORAGE: buffer + generate_variant_forall: + STORAGE: + - VALUE: buffer + - VALUE: texture2d + shader_variants: + - NAME: pack_q8_conv2d_dw_weights diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_q8_conv2d_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/pack_q8_conv2d_weights.glsl new file mode 100644 index 00000000000..e9982a8273d --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/pack_q8_conv2d_weights.glsl @@ -0,0 +1,82 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +${define_active_storage_type(STORAGE)} + +#extension GL_EXT_control_flow_attributes : require + +${define_required_extensions("int8")} + +layout(std430) buffer; + +${layout_declare_tensor(B, "w", "t_packed_int8_weight", "int", STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_int8_weight", "int8", "buffer")} + +layout(push_constant) uniform restrict Block { + ivec4 qmat2_sizes; + ivec4 orig_sizes; // [OC, K_h, K_w, IC] +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +#include "common.glslh" + +void main() { + const int block_x = int(gl_GlobalInvocationID.x); + const int block_y = int(gl_GlobalInvocationID.y); + + const int kx = block_x % orig_sizes.z; + const int oc4 = block_x / orig_sizes.z; + + const int OC4 = div_up_4(orig_sizes.x); + const int IC4 = div_up_4(orig_sizes.w); + + const int nblocks_x = orig_sizes.z * OC4; + const int nblocks_y = IC4 * orig_sizes.y; + + const int ic4 = block_y % IC4; + const int ky = block_y / IC4; + + if (block_x >= nblocks_x || block_y >= nblocks_y) { + return; + } + + const int oc = mul_4(oc4); + const int ic = mul_4(ic4); + + const int oc_stride = align_up_4(orig_sizes.y * 
orig_sizes.z * orig_sizes.w); + const int oc_offset = oc * oc_stride; + const int ky_offset = ky * (orig_sizes.z * orig_sizes.w); + const int kx_offset = kx * orig_sizes.w; + int buf_idx = oc_offset + ky_offset + kx_offset + ic; + + ivec4 packed_block = ivec4(0); + for (int row = 0; row < 4; row++) { + if (oc + row < orig_sizes.x) { + ivec4 weight_vals = ivec4(0); + for (int col = 0; col < 4; col++) { + if (ic + col < orig_sizes.w) { + weight_vals[col] = int(t_int8_weight[buf_idx + col]); + } + } + packed_block[row] = pack_into_int32(weight_vals); + } + buf_idx += oc_stride; + } + +#ifdef USING_BUFFER + const int out_buf_idx = block_y * (nblocks_x) + block_x; + t_packed_int8_weight[out_buf_idx] = packed_block; +#else + imageStore(t_packed_int8_weight, ivec2(block_x, block_y), packed_block); +#endif +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_q8_conv2d_weights.yaml b/backends/vulkan/runtime/graph/ops/glsl/pack_q8_conv2d_weights.yaml new file mode 100644 index 00000000000..9331de6e758 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/pack_q8_conv2d_weights.yaml @@ -0,0 +1,15 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +pack_q8_conv2d_weights: + parameter_names_with_default_values: + STORAGE: buffer + generate_variant_forall: + STORAGE: + - VALUE: buffer + - VALUE: texture2d + shader_variants: + - NAME: pack_q8_conv2d_weights diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_q8ta_conv2d_input.glsl b/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_q8ta_conv2d_input.glsl new file mode 100644 index 00000000000..d485523709b --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_q8ta_conv2d_input.glsl @@ -0,0 +1,77 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} +#define VEC4_T ${texel_load_type(DTYPE, INPUT_STORAGE)} +#define T ${texel_load_component_type(DTYPE, INPUT_STORAGE)} + +// corresponds to the input width dim +#define TILE_M4 1 +// corresponds to the input channels dim +#define TILE_K4 1 + +#define TILE_M 4 + +$if OUTPUT_STORAGE == "buffer": + #define OUTPUT_BUFFER +$if INPUT_STORAGE == "buffer": + #define INPUT_BUFFER + +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +#include "conv2d_common.glslh" + +${layout_declare_tensor(B, "w", "t_packed_int8_input", "int", OUTPUT_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_fp_input", DTYPE, INPUT_STORAGE, is_scalar_array=False)} + +${layout_declare_ubo(B, "ivec4", "input_sizes")} + +layout(push_constant) uniform restrict Block { + float inv_scale; + int zp; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +#include "conv2d_fp_input_tile_load.glslh" +#include "linear_int8_input_block.glslh" + +void store_packed_int8_block( + const Conv2dBlockIndex block_idx, + const Conv2dBlockExtents block_extents, + const Int8InputBlock packed_int8_block) { +#ifdef OUTPUT_BUFFER + const int buffer_idx = block_idx.data.y * block_extents.data_xz + + block_idx.data.x * block_extents.data.z + block_idx.data.z; + t_packed_int8_input[buffer_idx] = packed_int8_block.data; +#else + imageStore(t_packed_int8_input, block_idx.data, packed_int8_block.data); +#endif +} + +void main() { + Conv2dBlockIndex block_idx; + block_idx.data = ivec3(gl_GlobalInvocationID); + + Conv2dBlockExtents block_extents = make_block_extents(input_sizes); + if (block_idx_out_of_bounds(block_idx, block_extents)) { + return; + } + + FPInputTile fp_tile; + load_fp_input_tile(fp_tile, block_idx); + + Int8InputBlock int8_block; + 
quantize_and_pack(int8_block, fp_tile, inv_scale, zp); + + store_packed_int8_block(block_idx, block_extents, int8_block); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_q8ta_conv2d_input.yaml b/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_q8ta_conv2d_input.yaml new file mode 100644 index 00000000000..712d3156e2e --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_q8ta_conv2d_input.yaml @@ -0,0 +1,21 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +quantize_and_pack_q8ta_conv2d_input: + parameter_names_with_default_values: + DTYPE: float + OUTPUT_STORAGE: texture3d + INPUT_STORAGE: texture3d + generate_variant_forall: + combination: + parameter_names: [OUTPUT_STORAGE, INPUT_STORAGE] + combos: + - parameter_values: [texture3d, texture3d] + - parameter_values: [buffer, texture3d] + DTYPE: + - VALUE: float + shader_variants: + - NAME: quantize_and_pack_q8ta_conv2d_input diff --git a/backends/vulkan/runtime/graph/ops/glsl/sdpa_fp_k_cache_tile_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/sdpa_fp_k_cache_tile_load.glslh index 03132db1348..1880397181d 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/sdpa_fp_k_cache_tile_load.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/sdpa_fp_k_cache_tile_load.glslh @@ -44,7 +44,6 @@ void load_k_cache_tile_no_checks( const int context_len, const int C, const int KV_H) { - bool should_print = d4_start == 0 && c_start == 0 && kv_h == 0; [[unroll]] for (int c = 0; c < TILE_N; ++c) { const int c4 = div_4(c); const int c4i = mod_4(c); diff --git a/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl b/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl index d35492bc367..86a2229c416 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl +++ 
b/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl @@ -42,7 +42,8 @@ layout(constant_id = 5) const int group_dim = 1; // work group will write into its assigned element in the shared array. #define MAX_NTHREADS 16 -shared vec4 shared_vecs[MAX_NTHREADS]; +shared vec4 shared_max[MAX_NTHREADS]; +shared vec4 shared_sum[MAX_NTHREADS]; #include "indexing_utils.h" @@ -102,13 +103,13 @@ void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { max_elements = max(max_elements, load_texel(tin, scan_pos)); } - shared_vecs[smi] = max_elements; + shared_max[smi] = max_elements; barrier(); // Iterate over the partial maximums to obtain the overall maximum group_i = tid.y * NWORKERS; - max_elements = shared_vecs[group_i++]; + max_elements = shared_max[group_i++]; for (int i = 1; i < NWORKERS; ++i, group_i++) { - max_elements = max(max_elements, shared_vecs[group_i]); + max_elements = max(max_elements, shared_max[group_i]); } scan_pos[reduce_dim] = tid.x; @@ -118,13 +119,13 @@ void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { denominators += exp(load_texel(tin, scan_pos) - max_elements); } - shared_vecs[smi] = denominators; + shared_sum[smi] = denominators; barrier(); // Iterate over the partial sums to obtain the overall sum group_i = tid.y * NWORKERS; - denominators = shared_vecs[group_i++]; + denominators = shared_sum[group_i++]; for (int i = 1; i < NWORKERS; ++i, group_i++) { - denominators += shared_vecs[group_i]; + denominators += shared_sum[group_i]; } // Determine if there are any padding elements in the final texel of the @@ -184,13 +185,13 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) { max_elements.x = max(intex[i], max_elements.x); } } - shared_vecs[smi] = max_elements; + shared_max[smi] = max_elements; barrier(); // Iterate over the partial maximums to obtain the overall maximum group_i = tid.y * NWORKERS; - max_elements = 
shared_vecs[group_i++]; + max_elements = shared_max[group_i++]; for (int i = 1; i < NWORKERS; ++i, group_i++) { - max_elements = max(max_elements, shared_vecs[group_i]); + max_elements = max(max_elements, shared_max[group_i]); } // Each element of the texel is itself a partial maximum; iterate over the // texel to find the actual maximum @@ -214,13 +215,13 @@ void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) { denominators.x += exp(intex[i] - max_element); } } - shared_vecs[smi] = denominators; + shared_sum[smi] = denominators; barrier(); // Iterate over the partial sums to obtain the overall sum group_i = tid.y * NWORKERS; - denominators = shared_vecs[group_i++]; + denominators = shared_sum[group_i++]; for (int i = 1; i < NWORKERS; ++i, group_i++) { - denominators += shared_vecs[group_i]; + denominators += shared_sum[group_i]; } // Reduce over the accumulated texel to find the overall sum float denominator = 0; diff --git a/backends/vulkan/runtime/graph/ops/glsl/unpack_and_dequantize_q8ta_conv2d_output.glsl b/backends/vulkan/runtime/graph/ops/glsl/unpack_and_dequantize_q8ta_conv2d_output.glsl new file mode 100644 index 00000000000..798366b523a --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/unpack_and_dequantize_q8ta_conv2d_output.glsl @@ -0,0 +1,117 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#version 450 core + +#define PRECISION ${PRECISION} +#define VEC4_T ${texel_load_type(DTYPE, INPUT_STORAGE)} +#define T ${texel_load_component_type(DTYPE, INPUT_STORAGE)} + +// corresponds to the output width dim +#define TILE_M4 1 +// corresponds to the output channels dim +#define TILE_K4 1 + +#define TILE_M 4 + +$if OUTPUT_STORAGE == "buffer": + #define OUTPUT_BUFFER +$if INPUT_STORAGE == "buffer": + #define INPUT_BUFFER + +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +#include "conv2d_common.glslh" + +${layout_declare_tensor(B, "w", "t_fp_output", DTYPE, OUTPUT_STORAGE, is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_packed_int8_output", "int", INPUT_STORAGE, is_scalar_array=False)} + +${layout_declare_ubo(B, "ivec4", "output_sizes")} + +layout(push_constant) uniform restrict Block { + float scale; + int zp; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +#include "linear_fp_input_tile.glslh" +#include "linear_int8_input_tile.glslh" + +void load_packed_int8_tile( + out Int8InputTile int8_tile, + const Conv2dBlockIndex block_idx, + const Conv2dBlockExtents block_extents) { +#ifdef INPUT_BUFFER + const int buffer_idx = block_idx.data.y * block_extents.data_xz + + block_idx.data.x * block_extents.data.z + block_idx.data.z; + int8_tile.data[0][0] = t_packed_int8_output[buffer_idx]; +#else + int8_tile.data[0][0] = texelFetch(t_packed_int8_output, block_idx.data, 0); +#endif +} + +VEC4_T +dequantize_8bit(const ivec4 val, const float q_scale, const int q_zero_point) { + return VEC4_T(val - q_zero_point) * q_scale; +} + +void unpack_and_dequantize( + out FPInputTile fp_tile, + const Int8InputTile int8_tile, + const float q_scale, + const int q_zero_point) { + [[unroll]] for (int w = 0; w < 4; ++w) { + int packed = int8_tile.data[0][0][w]; + fp_tile.data[w][0] = dequantize_8bit( + ivec4( + extract_8bit_from_packed_int_le(packed, 0), + extract_8bit_from_packed_int_le(packed, 1), + 
extract_8bit_from_packed_int_le(packed, 2), + extract_8bit_from_packed_int_le(packed, 3)), + q_scale, + q_zero_point); + } +} + +void store_fp_output_texel( + const Conv2dTensorIndex tidx, + const VEC4_T out_texel) { + imageStore(t_fp_output, tidx.data, out_texel); +} + +void store_fp_tile( + const FPInputTile block, + const Conv2dBlockIndex block_idx) { + Conv2dTensorIndex store_tidx = block_idx_to_tensor_idx(block_idx); + [[unroll]] for (int w = 0; w < 4; w++) { + store_fp_output_texel(store_tidx, block.data[w][0]); + store_tidx.data.x++; + } +} + +void main() { + Conv2dBlockIndex block_idx; + block_idx.data = ivec3(gl_GlobalInvocationID); + + Conv2dBlockExtents block_extents = make_block_extents(output_sizes); + if (block_idx_out_of_bounds(block_idx, block_extents)) { + return; + } + + Int8InputTile int8_tile; + load_packed_int8_tile(int8_tile, block_idx, block_extents); + + FPInputTile fp_tile; + unpack_and_dequantize( + fp_tile, int8_tile, scale, zp); + + store_fp_tile(fp_tile, block_idx); +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/unpack_and_dequantize_q8ta_conv2d_output.yaml b/backends/vulkan/runtime/graph/ops/glsl/unpack_and_dequantize_q8ta_conv2d_output.yaml new file mode 100644 index 00000000000..24b253da343 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/unpack_and_dequantize_q8ta_conv2d_output.yaml @@ -0,0 +1,21 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +unpack_and_dequantize_q8ta_conv2d_output: + parameter_names_with_default_values: + DTYPE: float + OUTPUT_STORAGE: texture3d + INPUT_STORAGE: texture3d + generate_variant_forall: + combination: + parameter_names: [OUTPUT_STORAGE, INPUT_STORAGE] + combos: + - parameter_values: [texture3d, texture3d] + - parameter_values: [texture3d, buffer] + DTYPE: + - VALUE: float + shader_variants: + - NAME: unpack_and_dequantize_q8ta_conv2d_output diff --git a/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp b/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp index 757afd06849..a6dd8f07f53 100644 --- a/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp @@ -19,6 +19,18 @@ namespace vkcompute { +void resize_batch_norm_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + const ValueRef out = args.at(0).refs.at(0); + const ValueRef self = args.at(1).refs.at(0); + + // For batch norm, output dimensions are the same as input dimensions + std::vector new_out_sizes = graph->sizes_of(self); + graph->virtual_resize(out, new_out_sizes); +} + ValueRef check_and_prepack_arg( ComputeGraph& graph, ValueRef arg_ref, @@ -101,7 +113,7 @@ void add_native_batch_norm_node( // Resize Args {}, // Resizing Logic - nullptr)); + resize_batch_norm_node)); } void native_batch_norm(ComputeGraph& graph, const std::vector& args) { diff --git a/backends/vulkan/runtime/graph/ops/impl/Common.cpp b/backends/vulkan/runtime/graph/ops/impl/Common.cpp index 6c701224f7f..71690ffc604 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Common.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Common.cpp @@ -56,4 +56,27 @@ utils::uvec3 pick_hw_square_wg_size( return {16u, 4u, 1u}; } +utils::uvec3 pick_wc_square_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const utils::uvec3& global_workgroup_size, + const std::vector& args, + const std::vector& resize_args) { + (void)graph; + (void)shader; + 
(void)args; + (void)resize_args; + // Some inactive invocations are okay; set 6 as the threshold to use the + // a square wg size. + if (global_workgroup_size[0u] >= 6 && global_workgroup_size[2u] >= 6) { + return {8u, 1u, 8u}; + } + // If channels dim is sufficiently small, then bias towards width dim to + // reduce the number of inactive invocations. + if (global_workgroup_size[2u] < 2u) { + return {64u, 1u, 1u}; + } + return {16u, 1u, 4u}; +} + } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Common.h b/backends/vulkan/runtime/graph/ops/impl/Common.h index 1831ab2a845..b412f737c13 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Common.h +++ b/backends/vulkan/runtime/graph/ops/impl/Common.h @@ -54,4 +54,11 @@ utils::uvec3 pick_hw_square_wg_size( const std::vector& args, const std::vector& resize_args); +utils::uvec3 pick_wc_square_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const utils::uvec3& global_workgroup_size, + const std::vector& args, + const std::vector& resize_args); + } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index b83164f27d2..479bb44ae6f 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -365,6 +365,10 @@ utils::uvec3 conv2d_global_wg_size( if (method == Conv2dMethod::Depthwise || method == Conv2dMethod::Pointwise) { wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1}; + + if (shader.kernel_name.find("s1p0") != std::string::npos) { + wg_size[0] *= 4; + } } return wg_size; diff --git a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp index 9ac4c963bc3..329620e80e6 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp @@ -109,11 +109,15 @@ void add_permute_node( { IntListPtr permute_dims_ptr 
= graph.get_int_list(permute_dims); const int32_t permute_ndim = - utils::safe_downcast(permute_dims_ptr->size()); + utils::safe_downcast(permute_dims_ptr->size()); for (int32_t nchw_i = permute_ndim - 1, whcn_i = 0; nchw_i >= 0; nchw_i--, whcn_i++) { - const int32_t permute_dim_nchw = permute_dims_ptr->at(nchw_i); + int32_t permute_dim_nchw = + utils::safe_downcast(permute_dims_ptr->at(nchw_i)); + if (permute_dim_nchw < 0) { + permute_dim_nchw += permute_ndim; + } const int32_t permute_dim_whcn = permute_ndim - 1 - permute_dim_nchw; whcn_permute_dims[whcn_i] = permute_dim_whcn; diff --git a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp index 250fcdd5490..879f59667d6 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp @@ -137,7 +137,7 @@ void max_pool2d(ComputeGraph& graph, const std::vector& args) { struct DivisorParams final { int32_t divisor_override; - bool count_include_pad; + int32_t count_include_pad; }; DivisorParams create_divisor_params( @@ -148,7 +148,7 @@ DivisorParams create_divisor_params( graph.val_is_int(divisor_override) ? static_cast(graph.get_int(divisor_override)) : 0, - graph.get_bool(count_include_pad)}; + int32_t(graph.get_bool(count_include_pad))}; } void add_avg_pool2d_node( diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedBinary.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedBinary.cpp new file mode 100644 index 00000000000..4b359f12700 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedBinary.cpp @@ -0,0 +1,210 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include +#include + +namespace vkcompute { + +// +// Shader dispatch utilities +// + +utils::uvec3 pick_q8ta_q8ta_q8to_binary_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + const ValueRef packed_int8_output = args.at(0).refs.at(0); + + const uint32_t W = graph->size_at(-1, packed_int8_output); + const uint32_t H = graph->size_at(-2, packed_int8_output); + const uint32_t C = graph->size_at(-3, packed_int8_output); + + const uint32_t W4 = utils::div_up_4(W); + const uint32_t C4 = utils::div_up_4(C); + + return {W4 * H * C4, 1, 1}; +} + +// +// Dispatch nodes +// + +void add_q8ta_q8ta_q8to_binary_node( + ComputeGraph& graph, + const ValueRef packed_int8_input_a, + const ValueRef packed_int8_input_b, + const ValueRef input_a_scale, + const ValueRef input_a_zp, + const ValueRef input_b_scale, + const ValueRef input_b_zp, + const ValueRef output_scale, + const ValueRef output_zp, + const ValueRef alpha, + const ValueRef packed_int8_output, + const std::string& op_name) { + float input_a_scale_val = graph.extract_scalar(input_a_scale); + int32_t input_a_zp_val = graph.extract_scalar(input_a_zp); + float input_b_scale_val = graph.extract_scalar(input_b_scale); + int32_t input_b_zp_val = graph.extract_scalar(input_b_zp); + + float output_inv_scale_val = 1.0f / graph.extract_scalar(output_scale); + int32_t output_zp_val = graph.extract_scalar(output_zp); + + float alpha_val = 1.0f; + // String is checked since some ops pass in an unused string argument in + // place of alpha + if (is_valid(alpha) && !graph.val_is_string(alpha)) { + alpha_val = graph.extract_scalar(alpha); + } + + std::string kernel_name = op_name + "_q8ta_q8ta_q8to"; + add_storage_type_suffix( + kernel_name, graph.storage_type_of(packed_int8_output)); + + vkapi::ParamsBindList param_buffers = {graph.sizes_ubo(packed_int8_output)}; + + std::vector push_constants = { + 
PushConstantDataInfo(&input_a_scale_val, sizeof(input_a_scale_val)), + PushConstantDataInfo(&input_a_zp_val, sizeof(input_a_zp_val)), + PushConstantDataInfo(&input_b_scale_val, sizeof(input_b_scale_val)), + PushConstantDataInfo(&input_b_zp_val, sizeof(input_b_zp_val)), + PushConstantDataInfo(&output_inv_scale_val, sizeof(output_inv_scale_val)), + PushConstantDataInfo(&output_zp_val, sizeof(output_zp_val)), + PushConstantDataInfo(&alpha_val, sizeof(alpha_val)), + }; + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + pick_q8ta_q8ta_q8to_binary_global_wg_size, + default_pick_local_wg_size, + // Inputs and Outputs + {{packed_int8_output, vkapi::kWrite}, + {{packed_int8_input_a, packed_int8_input_b}, vkapi::kRead}}, + // Shader params buffers + param_buffers, + // Push Constants + push_constants, + // Specialization Constants + {}, + // Resize args + {}, + // Resizing Logic + nullptr)); +} + +// +// High level operator impl +// + +void add_q8ta_q8ta_q8to( + ComputeGraph& graph, + const std::vector& args) { + int32_t idx = 0; + const ValueRef packed_int8_input_a = args.at(idx++); + const ValueRef packed_int8_input_b = args.at(idx++); + const ValueRef input_a_scale = args.at(idx++); + const ValueRef input_a_zp = args.at(idx++); + const ValueRef input_b_scale = args.at(idx++); + const ValueRef input_b_zp = args.at(idx++); + const ValueRef output_scale = args.at(idx++); + const ValueRef output_zp = args.at(idx++); + const ValueRef alpha = args.at(idx++); + const ValueRef packed_int8_output = args.at(idx++); + + add_q8ta_q8ta_q8to_binary_node( + graph, + packed_int8_input_a, + packed_int8_input_b, + input_a_scale, + input_a_zp, + input_b_scale, + input_b_zp, + output_scale, + output_zp, + alpha, + packed_int8_output, + "add"); +} + +// +// Test operators +// + +void add_q8ta_q8ta_q8to_test( + ComputeGraph& graph, + const std::vector& args) { + int32_t idx = 0; + const ValueRef fp_input_a = args.at(idx++); + const 
ValueRef fp_input_b = args.at(idx++); + const ValueRef input_a_scale = args.at(idx++); + const ValueRef input_a_zp = args.at(idx++); + const ValueRef input_b_scale = args.at(idx++); + const ValueRef input_b_zp = args.at(idx++); + const ValueRef output_scale = args.at(idx++); + const ValueRef output_zp = args.at(idx++); + const ValueRef alpha = args.at(idx++); + const ValueRef fp_output = args.at(idx++); + + TmpTensor packed_int8_input_a( + &graph, + graph.sizes_of(fp_input_a), + vkapi::kInt8x4, + utils::kBuffer, + utils::kPackedInt8_4W4C); + + TmpTensor packed_int8_input_b( + &graph, + graph.sizes_of(fp_input_b), + vkapi::kInt8x4, + utils::kBuffer, + utils::kPackedInt8_4W4C); + + TmpTensor packed_int8_output( + &graph, + graph.sizes_of(fp_output), + vkapi::kInt8x4, + utils::kBuffer, + utils::kPackedInt8_4W4C); + + add_quantize_and_pack_q8ta_conv2d_input_node( + graph, fp_input_a, input_a_scale, input_a_zp, packed_int8_input_a); + + add_quantize_and_pack_q8ta_conv2d_input_node( + graph, fp_input_b, input_b_scale, input_b_zp, packed_int8_input_b); + + std::vector add_args = { + packed_int8_input_a, + packed_int8_input_b, + input_a_scale, + input_a_zp, + input_b_scale, + input_b_zp, + output_scale, + output_zp, + alpha, + packed_int8_output}; + + add_q8ta_q8ta_q8to(graph, add_args); + + add_unpack_and_dequantize_q8ta_conv2d_output_node( + graph, packed_int8_output, output_scale, output_zp, fp_output); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(et_vk.add_q8ta_q8ta_q8to.default, add_q8ta_q8ta_q8to); + VK_REGISTER_OP(et_vk.add_q8ta_q8ta_q8to.test, add_q8ta_q8ta_q8to_test); +} + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp index 51f8138485e..775e4534cfb 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp @@ -9,6 +9,7 @@ #include #include +#include #include 
#include #include @@ -19,6 +20,86 @@ namespace vkcompute { // Utility functions // +bool is_pointwise(ComputeGraph* graph, const ValueRef& kernel_size) { + const auto kernel_size_list = graph->get_int_list(kernel_size); + return kernel_size_list->at(0) == 1 && kernel_size_list->at(1) == 1; +} + +bool is_s1p1d1( + ComputeGraph* graph, + const ValueRef& stride, + const ValueRef& padding, + const ValueRef& dilation) { + const auto stride_list = graph->get_int_list(stride); + const auto padding_list = graph->get_int_list(padding); + const auto dilation_list = graph->get_int_list(dilation); + if (stride_list->at(0) != 1 && stride_list->at(1) != 1) { + return false; + } + if (padding_list->at(0) != 1 && padding_list->at(1) != 1) { + return false; + } + if (dilation_list->at(0) != 1 && dilation_list->at(1) != 1) { + return false; + } + return true; +} + +bool is_s1p0d1_pointwise( + ComputeGraph* graph, + const ValueRef& kernel_size, + const ValueRef& stride, + const ValueRef& padding, + const ValueRef& dilation) { + if (is_pointwise(graph, kernel_size)) { + const auto stride_list = graph->get_int_list(stride); + const auto padding_list = graph->get_int_list(padding); + const auto dilation_list = graph->get_int_list(dilation); + if (stride_list->at(0) != 1 && stride_list->at(1) != 1) { + return false; + } + if (padding_list->at(0) != 0 && padding_list->at(1) != 0) { + return false; + } + if (dilation_list->at(0) != 1 && dilation_list->at(1) != 1) { + return false; + } + return true; + } + return false; +} + +bool should_use_im2col( + ComputeGraph* graph, + const ValueRef kernel_size, + const ValueRef groups) { + const auto kernel_size_list = graph->get_int_list(kernel_size); + + // Always use im2col for pointwise convolutions + if (kernel_size_list->at(0) * kernel_size_list->at(1) == 1) { + return true; + } + + // For large kernel sizes, the im2col matrix will be too big. 
Not only will + // this result in a larger footprint for the im2col matrix, but the cost of + // performing the im2col procedure will also become prohibitive. In these + // cases it is faster to just compute convolution directly without going + // through im2col. Empirically, im2col works well for 3x3 convolution and + // not for 5x5 convolution, so set the limit at 10. + if (kernel_size_list->at(0) * kernel_size_list->at(1) > 10) { + return false; + } + + // Only use im2col for non-grouped convolutions; manual experimentation shows + // that im2col becomes very slow when dealing with grouped convolutions. The + // reason for this is likely that memory access in the im2col shader becomes + // too non-linear due to needed to keep convolution groups contiguous in + // in memory. This means that the channels of the input tensor (which are + // originally contiguous in memory) will be split up during the im2col + // procedure. + return graph->get_int(groups) == 1; +} + struct Conv2DParams { utils::ivec2 kernel_size; utils::ivec2 stride; @@ -135,6 +216,43 @@ std::vector calculate_input_im2col_sizes( return {M, K}; } +std::vector calculate_packed_int8_input_im2col_sizes( + ComputeGraph* graph, + const ValueRef& input, + const ValueRef& output, + const ValueRef& kernel_size, + const ValueRef& groups) { + std::vector in_sizes = graph->sizes_of(input); + const int64_t in_channels = utils::val_at(-3, in_sizes); + + std::vector out_sizes = graph->sizes_of(output); + const int64_t out_height = utils::val_at(-2, out_sizes); + const int64_t out_width = utils::val_at(-1, out_sizes); + + // Represents the number of channel groups + const int64_t groups_val = graph->extract_scalar(groups); + // No need to div_up because in_channels % groups_val = 0 + const int64_t in_channels_per_group = in_channels / groups_val; + + const auto kernel_size_list = graph->get_int_list(kernel_size); + + // Align to the next multiple of 4 to ensure that data loads align nicely with + // texel 
boundaries. We want to ensure that the first data element of each + // group is at the start of its texel. + const int64_t flattened_kernel_len = utils::align_up_4( + in_channels_per_group * kernel_size_list->at(0) * + kernel_size_list->at(1)); + + // K -> flattened convolution window (repeated for each group) + const int64_t K = flattened_kernel_len * groups_val; + // M -> number of elements in 2D output plane. This is aligned to the next + // multiple of 4 since the im2col shader operates on 4x4 blocks. + const int64_t W = utils::align_up_4(out_width); + const int64_t H = out_height; + + return {K, H, W}; +} + std::vector calculate_output_im2col_sizes( ComputeGraph* graph, const ValueRef& output) { @@ -156,6 +274,40 @@ std::vector calculate_output_im2col_sizes( // Shader dispatch utilities // +utils::uvec3 pick_quantize_and_pack_conv2d_input_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + const ValueRef fp_input = args.at(1).refs.at(0); + + const uint32_t W = graph->size_at(-1, fp_input); + const uint32_t H = graph->size_at(-2, fp_input); + const uint32_t C = graph->size_at(-3, fp_input); + + const uint32_t W4 = utils::div_up_4(W); + const uint32_t C4 = utils::div_up_4(C); + + return {W4, H, C4}; +} + +utils::uvec3 pick_unpack_and_dequantize_conv2d_output_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + const ValueRef fp_output = args.at(0).refs.at(0); + + const uint32_t W = graph->size_at(-1, fp_output); + const uint32_t H = graph->size_at(-2, fp_output); + const uint32_t C = graph->size_at(-3, fp_output); + + const uint32_t W4 = utils::div_up_4(W); + const uint32_t C4 = utils::div_up_4(C); + + return {W4, H, C4}; +} + utils::uvec3 im2col_global_wg_size( ComputeGraph* graph, const vkapi::ShaderInfo& shader, @@ -178,6 +330,33 @@ utils::uvec3 im2col_global_wg_size( return {K4, M4, 1}; } 
+utils::uvec3 im2col_packed_int8_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + const ValueRef input_im2col = args.at(0).refs.at(0); + + std::vector im2col_sizes = graph->sizes_of(input_im2col); + const uint32_t K = utils::safe_downcast(im2col_sizes[0]); + const uint32_t H = utils::safe_downcast(im2col_sizes[1]); + const uint32_t W = utils::safe_downcast(im2col_sizes[2]); + + const uint32_t K4 = utils::div_up(K, 4u); + const uint32_t W4 = utils::div_up(W, 4u); + + return {K4 * W4 * H, 1, 1}; +} + +utils::uvec3 im2col_packed_int8_local_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const utils::uvec3& global_workgroup_size, + const std::vector& args, + const std::vector& resize_args) { + return {64, 1, 1}; +} + utils::uvec3 col2im_global_wg_size( ComputeGraph* graph, const vkapi::ShaderInfo& shader, @@ -197,6 +376,229 @@ utils::uvec3 col2im_global_wg_size( return {N4, M4, 1}; } +utils::uvec3 pick_static_quantized_conv2d_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + const ValueRef packed_int8_output = args.at(0).refs.at(0); + + const uint32_t W = graph->size_at(-1, packed_int8_output); + const uint32_t H = graph->size_at(-2, packed_int8_output); + const uint32_t C = graph->size_at(-3, packed_int8_output); + + uint32_t C_per_tile = 4; + uint32_t W_per_tile = 4; + + if (shader.kernel_name.find("linear") != std::string::npos) { + C_per_tile = 8; + } + + const uint32_t num_W_tiles = utils::div_up(W, W_per_tile); + const uint32_t num_C_tiles = utils::div_up(C, C_per_tile); + + return {num_C_tiles, num_W_tiles, H}; +} + +utils::uvec3 pick_static_quantized_conv2d_local_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const utils::uvec3& global_workgroup_size, + const std::vector& args, + const std::vector& resize_args) { + return 
pick_hw_square_wg_size( + graph, shader, global_workgroup_size, args, resize_args); +} + +utils::uvec3 int8_conv2d_dw_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + const ValueRef packed_int8_output = args.at(0).refs.at(0); + + const uint32_t W = graph->size_at(-1, packed_int8_output); + const uint32_t H = graph->size_at(-2, packed_int8_output); + const uint32_t C = graph->size_at(-3, packed_int8_output); + + const uint32_t W4 = utils::div_up_4(W); + const uint32_t C4 = utils::div_up_4(C); + + return {C4 * W4 * H, 1, 1}; +} + +// +// Prepack nodes +// + +ValueRef prepack_quantized_conv2d_weight( + ComputeGraph& graph, + const QuantizationConfig& weight_quant_config, + const ValueRef weight_data, + const ValueRef input, + const ValueRef output, + const ValueRef groups, + const ValueRef kernel_size) { + VK_CHECK_COND(weight_quant_config.nbits == 8); + VK_CHECK_COND(weight_quant_config.is_symmetric); + + const int32_t groups_val = graph.get_int(groups); + + const int64_t OC = graph.size_at(-3, output); + const int64_t IC = graph.size_at(-3, input) / groups_val; + + int64_t K_h; + int64_t K_w; + + { + const auto kernel_size_list = graph.get_int_list(kernel_size); + K_h = kernel_size_list->at(0); + K_w = kernel_size_list->at(1); + } + + const int64_t num_blocks_OC = utils::div_up_4(OC); + const int64_t num_blocks_IC = utils::div_up_4(IC); + + const int64_t num_blocks_y = num_blocks_IC * K_h; + const int64_t num_blocks_x = K_w * num_blocks_OC; + + // The packed tensor arranges blocks as [OC_blocks * K_total, IC_blocks] + const int64_t output_height = num_blocks_y; + const int64_t output_width = num_blocks_x * 4; + + // Store the original sizes of the weight data to pass to the shader + utils::ivec4 orig_sizes = { + utils::safe_downcast(OC), + utils::safe_downcast(K_h), + utils::safe_downcast(K_w), + utils::safe_downcast(IC)}; + + std::vector packed_weight_sizes{output_height, 
output_width}; + + utils::StorageType storage_type = utils::kTexture2D; + uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); + if (output_width > max_extent * 4 || output_height > max_extent) { + storage_type = utils::kBuffer; + } + + ValueRef packed_weight = graph.add_tensor( + packed_weight_sizes, + vkcompute::vkapi::kInt, + storage_type, + utils::kWidthPacked); + + utils::uvec3 global_wg_size = { + utils::safe_downcast(num_blocks_x), + utils::safe_downcast(num_blocks_y), + 1u}; + + std::string kernel_name = "pack_q8_conv2d_weights"; + add_storage_type_suffix(kernel_name, storage_type); + + graph.prepack_nodes().emplace_back(new PrepackNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_wg_size, + graph.create_local_wg_size(global_wg_size), + // Inputs and Outputs + weight_data, + packed_weight, + // UBOs + {}, + // Specialization Constants + {}, + // Push Constants + {graph.sizes_pc_of(packed_weight), + PushConstantDataInfo(&orig_sizes, sizeof(utils::ivec4))})); + + return packed_weight; +} + +ValueRef prepack_quantized_conv2d_dw_weight( + ComputeGraph& graph, + const QuantizationConfig& weight_quant_config, + const ValueRef weight_data, + const ValueRef kernel_size) { + VK_CHECK_COND(weight_quant_config.nbits == 8); + VK_CHECK_COND(weight_quant_config.is_symmetric); + + std::vector weight_orig_sizes = graph.sizes_of(weight_data); + const int64_t ndim = graph.dim_of(weight_data); + + // For depthwise convolution, expect weight layout [K_h, aligned_K_w, OC] + VK_CHECK_COND(ndim == 3); + int64_t K_h = weight_orig_sizes.at(0); + int64_t K_w = weight_orig_sizes.at(1); + int64_t aligned_K_w = utils::align_up_4(K_w); + int64_t OC = weight_orig_sizes.at(2); + + // The packing format packs the weight tensor into blocks of 4 output channels + // (OC) and 4 kernel elements (K_h * aligned_K_w) + int64_t OC_per_block = 4; + int64_t K_per_block = 4; + + // To figure out the size of the output tensor, determine the number of blocks + // along 
each dimension. + const int64_t total_K_elements = K_h * aligned_K_w; + const int64_t num_blocks_K = utils::div_up(total_K_elements, K_per_block); + const int64_t num_blocks_OC = utils::div_up(OC, OC_per_block); + + // The blocks are arranged in a transposed manner, such that the transposed + // weight block is indexed like packed_weights[k4][oc4] - this is to allow for + // optimal memory coalescing when computing the depthwise convolution. + int64_t output_height = num_blocks_K; + // The base dtype of the packed tensor is int32 (each int32 contains 4x 8bit + // values) and each block is represented as a ivec4. Therefore the width dim + // of the packed tensor is multiplied by 4. + int64_t output_width = num_blocks_OC * 4; + + // Store the original sizes of the weight data to pass to the shader + utils::ivec3 orig_sizes = { + utils::safe_downcast(K_h), + utils::safe_downcast(K_w), + utils::safe_downcast(OC)}; + + std::vector packed_weight_sizes{output_height, output_width}; + + utils::StorageType storage_type = utils::kTexture2D; + uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); + if (output_width > max_extent * 4 || output_height > max_extent) { + storage_type = utils::kBuffer; + } + + ValueRef packed_weight = graph.add_tensor( + packed_weight_sizes, + vkcompute::vkapi::kInt, + storage_type, + utils::kWidthPacked); + + utils::uvec3 global_wg_size = { + utils::safe_downcast(num_blocks_OC), + utils::safe_downcast(num_blocks_K), + 1u}; + + std::string kernel_name = "pack_q8_conv2d_dw_weights"; + add_storage_type_suffix(kernel_name, storage_type); + + graph.prepack_nodes().emplace_back(new PrepackNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_wg_size, + graph.create_local_wg_size(global_wg_size), + // Inputs and Outputs + weight_data, + packed_weight, + // UBOs + {}, + // Specialization Constants + {}, + // Push Constants + {graph.sizes_pc_of(packed_weight), + PushConstantDataInfo(&orig_sizes, sizeof(utils::ivec3))})); + + 
return packed_weight; +} + // // Dispatch nodes // @@ -251,6 +653,145 @@ void add_input_im2col_node( nullptr)); } +void add_input_im2col_packed_int8_node( + ComputeGraph& graph, + const ValueRef input, + const ValueRef input_scale, + const ValueRef input_zp, + const ValueRef kernel_size, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef groups, + const ValueRef output, + const ValueRef input_im2col) { + Conv2DParams conv_params = create_conv2d_params( + graph, input, output, kernel_size, stride, padding, dilation, groups); + + float inv_scale = 1.0f / graph.extract_scalar(input_scale); + int32_t zp = graph.extract_scalar(input_zp); + + std::string kernel_name = "im2col_packed_int8"; + add_storage_type_suffix(kernel_name, graph.storage_type_of(input_im2col)); + + vkapi::ParamsBindList param_buffers = { + graph.sizes_ubo(input_im2col), + graph.sizes_ubo(output), + graph.sizes_ubo(input), + graph.create_params_buffer(conv_params)}; + + std::vector push_constants = { + PushConstantDataInfo(&inv_scale, sizeof(inv_scale)), + PushConstantDataInfo(&zp, sizeof(zp)), + }; + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + im2col_packed_int8_global_wg_size, + im2col_packed_int8_local_wg_size, + // Inputs and Outputs + {{input_im2col, vkapi::kWrite}, {input, vkapi::kRead}}, + // Shader params buffers + param_buffers, + // Push Constants + push_constants, + // Specialization Constants + {}, + // Resize args + {}, + // Resizing Logic + nullptr)); +} + +void add_quantize_and_pack_q8ta_conv2d_input_node( + ComputeGraph& graph, + const ValueRef fp_input, + const ValueRef input_scale, + const ValueRef input_zp, + const ValueRef packed_int8_input) { + float inv_scale = 1.0f / graph.extract_scalar(input_scale); + int32_t zp = graph.extract_scalar(input_zp); + + // Get shader for quantized conv2d linear tiled + std::string kernel_name = "quantize_and_pack_q8ta_conv2d_input"; + 
add_storage_type_suffix( + kernel_name, graph.storage_type_of(packed_int8_input)); + add_storage_type_suffix(kernel_name, graph.storage_type_of(fp_input)); + add_dtype_suffix(kernel_name, graph.dtype_of(fp_input)); + + vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); + + vkapi::ParamsBindList param_buffers = {graph.sizes_ubo(fp_input)}; + + std::vector push_constants = { + PushConstantDataInfo(&inv_scale, sizeof(inv_scale)), + PushConstantDataInfo(&zp, sizeof(zp)), + }; + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + pick_quantize_and_pack_conv2d_input_global_wg_size, + pick_wc_square_wg_size, + // Inputs and Outputs + {{packed_int8_input, vkapi::kWrite}, {fp_input, vkapi::kRead}}, + // Shader params buffers + param_buffers, + // Push Constants + push_constants, + // Specialization Constants + {}, + // Resize args + {}, + // Resizing Logic + nullptr)); +} + +void add_unpack_and_dequantize_q8ta_conv2d_output_node( + ComputeGraph& graph, + const ValueRef packed_int8_output, + const ValueRef output_scale, + const ValueRef output_zp, + const ValueRef fp_output) { + float scale = graph.extract_scalar(output_scale); + int32_t zp = graph.extract_scalar(output_zp); + + // Get shader for quantized conv2d linear tiled + std::string kernel_name = "unpack_and_dequantize_q8ta_conv2d_output"; + add_storage_type_suffix(kernel_name, graph.storage_type_of(fp_output)); + add_storage_type_suffix( + kernel_name, graph.storage_type_of(packed_int8_output)); + add_dtype_suffix(kernel_name, graph.dtype_of(fp_output)); + + vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); + + vkapi::ParamsBindList param_buffers = {graph.sizes_ubo(fp_output)}; + + std::vector push_constants = { + PushConstantDataInfo(&scale, sizeof(scale)), + PushConstantDataInfo(&zp, sizeof(zp)), + }; + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + 
pick_unpack_and_dequantize_conv2d_output_global_wg_size, + default_pick_local_wg_size, + // Inputs and Outputs + {{fp_output, vkapi::kWrite}, {packed_int8_output, vkapi::kRead}}, + // Shader params buffers + param_buffers, + // Push Constants + push_constants, + // Specialization Constants + {}, + // Resize args + {}, + // Resizing Logic + nullptr)); +} + void add_quantize_and_pack_im2col_node( ComputeGraph& graph, const ValueRef input_image, @@ -307,19 +848,178 @@ void add_quantize_and_pack_im2col_node( // Push Constants push_constants, // Specialization Constants - {}, + {}, + // Resize args + {output_image, kernel_size, groups}, + // Resizing Logic + nullptr)); +} + +void add_conv2d_q8csw_linear_node( + ComputeGraph& graph, + const ValueRef input_im2col, + const ValueRef input_image, + const ValueRef packed_weight, + const ValueRef packed_weight_scales, + const ValueRef bias_data, + const ValueRef packed_bias, + const ValueRef kernel_size, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef groups, + const ValueRef output_image) { + Conv2DParams conv_params = create_conv2d_params( + graph, + input_image, + output_image, + kernel_size, + stride, + padding, + dilation, + groups); + + // One limitation of the current implementation is that for grouped convs, + // the number of output_image channels per group must be a multiple of 4. One + // loaded 4x4 weight tile must all belong to the same group. 
+ if (conv_params.groups > 1) { + VK_CHECK_COND(conv_params.out_channels_per_group % 4 == 0); + } + + std::string kernel_name = "conv2d_q8csw_linear_tiled"; + add_storage_type_suffix(kernel_name, graph.storage_type_of(output_image)); + add_storage_type_suffix(kernel_name, graph.storage_type_of(input_im2col)); + add_storage_type_suffix(kernel_name, graph.storage_type_of(packed_weight)); + add_dtype_suffix(kernel_name, graph.dtype_of(output_image)); + vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); + + vkapi::ParamsBindList param_buffers = { + graph.sizes_ubo(output_image), + graph.sizes_ubo(input_image), + graph.create_params_buffer(conv_params)}; + + uint32_t apply_bias = 1; + if (graph.val_is_none(bias_data)) { + apply_bias = 0; + } + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + col2im_global_wg_size, + quantized_linear_local_wg_size, + // Inputs and Outputs + {{output_image, vkapi::kWrite}, + {{input_im2col, packed_weight, packed_weight_scales, packed_bias}, + vkapi::kRead}}, + // Shader params buffers + param_buffers, + // Push Constants + {}, + // Specialization Constants + {apply_bias}, + // Resize args + {}, + // Resizing Logic + nullptr)); +} + +void add_conv2d_q8ta_q8csw_linear_node( + ComputeGraph& graph, + const ValueRef input_int_im2col, + const ValueRef input_image, + const ValueRef input_scale, + const ValueRef input_zp, + const ValueRef weight_data, + const ValueRef packed_weight, + const ValueRef packed_weight_sums, + const ValueRef packed_weight_scales, + const ValueRef bias_data, + const ValueRef packed_bias, + const ValueRef kernel_size, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef groups, + const ValueRef output_image) { + Conv2DParams conv_params = create_conv2d_params( + graph, + input_image, + output_image, + kernel_size, + stride, + padding, + dilation, + groups); + + // One limitation of the current implementation is 
that for grouped convs, + // the number of output channels per group must be a multiple of 4. One loaded + // 4x4 weight tile must all belong to the same group. + if (conv_params.groups > 1) { + VK_CHECK_COND(conv_params.out_channels_per_group % 4 == 0); + } + + float scale = graph.extract_scalar(input_scale); + int32_t zp = graph.extract_scalar(input_zp); + + std::string kernel_name = "conv2d_q8ta_q8csw_linear_tiled"; + add_storage_type_suffix(kernel_name, graph.storage_type_of(output_image)); + add_storage_type_suffix(kernel_name, graph.storage_type_of(input_int_im2col)); + add_storage_type_suffix(kernel_name, graph.storage_type_of(packed_weight)); + add_dtype_suffix(kernel_name, graph.dtype_of(output_image)); + vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); + + vkapi::ParamsBindList param_buffers = { + graph.sizes_ubo(output_image), + graph.sizes_ubo(input_image), + graph.create_params_buffer(conv_params)}; + + std::vector push_constants = { + PushConstantDataInfo(&scale, sizeof(scale)), + PushConstantDataInfo(&zp, sizeof(zp)), + }; + + uint32_t apply_bias = 1; + if (graph.val_is_none(bias_data)) { + apply_bias = 0; + } + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + col2im_global_wg_size, + quantized_linear_local_wg_size, + // Inputs and Outputs + {{output_image, vkapi::kWrite}, + {{input_int_im2col, + packed_weight, + packed_weight_sums, + packed_weight_scales, + packed_bias}, + vkapi::kRead}}, + // Shader params buffers + param_buffers, + // Push Constants + push_constants, + // Specialization Constants + {apply_bias}, // Resize args - {output_image, kernel_size, groups}, + {weight_data}, // Resizing Logic nullptr)); } -void add_conv2d_q8csw_linear_node( +void add_conv2d_q8ta_q8csw_q8to_node( ComputeGraph& graph, - const ValueRef input_im2col, - const ValueRef input_image, + const ValueRef packed_int8_input, + const ValueRef packed_int8_input_im2col, + const ValueRef input_scale, + 
const ValueRef input_zp, const ValueRef packed_weight, + const ValueRef packed_weight_sums, const ValueRef packed_weight_scales, + const ValueRef output_scale, + const ValueRef output_zp, const ValueRef bias_data, const ValueRef packed_bias, const ValueRef kernel_size, @@ -327,36 +1027,45 @@ void add_conv2d_q8csw_linear_node( const ValueRef padding, const ValueRef dilation, const ValueRef groups, - const ValueRef output_image) { + const ValueRef packed_int8_output) { Conv2DParams conv_params = create_conv2d_params( graph, - input_image, - output_image, + packed_int8_input, + packed_int8_output, kernel_size, stride, padding, dilation, groups); - // One limitation of the current implementation is that for grouped convs, - // the number of output_image channels per group must be a multiple of 4. One - // loaded 4x4 weight tile must all belong to the same group. - if (conv_params.groups > 1) { - VK_CHECK_COND(conv_params.out_channels_per_group % 4 == 0); - } + const bool use_im2col = should_use_im2col(&graph, kernel_size, groups); - std::string kernel_name = "conv2d_q8csw_linear_tiled"; - add_storage_type_suffix(kernel_name, graph.storage_type_of(output_image)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input_im2col)); + float input_scale_val = graph.extract_scalar(input_scale); + int32_t input_zp_val = graph.extract_scalar(input_zp); + + float output_inv_scale_val = 1.0f / graph.extract_scalar(output_scale); + int32_t output_zp_val = graph.extract_scalar(output_zp); + + std::string kernel_name = use_im2col ? 
"conv2d_q8ta_q8csw_q8to_linear_tiled" + : "conv2d_q8ta_q8csw_q8to"; + add_storage_type_suffix( + kernel_name, graph.storage_type_of(packed_int8_output)); add_storage_type_suffix(kernel_name, graph.storage_type_of(packed_weight)); - add_dtype_suffix(kernel_name, graph.dtype_of(output_image)); + add_dtype_suffix(kernel_name, graph.dtype_of(packed_weight_scales)); vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); vkapi::ParamsBindList param_buffers = { - graph.sizes_ubo(output_image), - graph.sizes_ubo(input_image), + graph.sizes_ubo(packed_int8_output), + graph.sizes_ubo(packed_int8_input_im2col), graph.create_params_buffer(conv_params)}; + std::vector push_constants = { + PushConstantDataInfo(&input_scale_val, sizeof(input_scale_val)), + PushConstantDataInfo(&input_zp_val, sizeof(input_zp_val)), + PushConstantDataInfo(&output_inv_scale_val, sizeof(output_inv_scale_val)), + PushConstantDataInfo(&output_zp_val, sizeof(output_zp_val)), + }; + uint32_t apply_bias = 1; if (graph.val_is_none(bias_data)) { apply_bias = 0; @@ -365,16 +1074,20 @@ void add_conv2d_q8csw_linear_node( graph.execute_nodes().emplace_back(new DynamicDispatchNode( graph, VK_KERNEL_FROM_STR(kernel_name), - col2im_global_wg_size, - quantized_linear_local_wg_size, + pick_static_quantized_conv2d_global_wg_size, + pick_static_quantized_conv2d_local_wg_size, // Inputs and Outputs - {{output_image, vkapi::kWrite}, - {{input_im2col, packed_weight, packed_weight_scales, packed_bias}, + {{packed_int8_output, vkapi::kWrite}, + {{packed_int8_input_im2col, + packed_weight, + packed_weight_sums, + packed_weight_scales, + packed_bias}, vkapi::kRead}}, // Shader params buffers param_buffers, // Push Constants - {}, + push_constants, // Specialization Constants {apply_bias}, // Resize args @@ -383,16 +1096,16 @@ void add_conv2d_q8csw_linear_node( nullptr)); } -void add_conv2d_q8ta_q8csw_linear_node( +void add_conv2d_dw_q8ta_q8csw_q8to_node( ComputeGraph& graph, - const ValueRef input_int_im2col, - const 
ValueRef input_image, + const ValueRef packed_int8_input, const ValueRef input_scale, const ValueRef input_zp, - const ValueRef weight_data, const ValueRef packed_weight, const ValueRef packed_weight_sums, const ValueRef packed_weight_scales, + const ValueRef output_scale, + const ValueRef output_zp, const ValueRef bias_data, const ValueRef packed_bias, const ValueRef kernel_size, @@ -400,42 +1113,45 @@ void add_conv2d_q8ta_q8csw_linear_node( const ValueRef padding, const ValueRef dilation, const ValueRef groups, - const ValueRef output_image) { + const ValueRef packed_int8_output) { Conv2DParams conv_params = create_conv2d_params( graph, - input_image, - output_image, + packed_int8_input, + packed_int8_output, kernel_size, stride, padding, dilation, groups); - // One limitation of the current implementation is that for grouped convs, - // the number of output channels per group must be a multiple of 4. One loaded - // 4x4 weight tile must all belong to the same group. - if (conv_params.groups > 1) { - VK_CHECK_COND(conv_params.out_channels_per_group % 4 == 0); - } + // Verify this is actually a depthwise convolution + const int64_t groups_val = graph.extract_scalar(groups); + const int64_t in_channels = graph.size_at(-3, packed_int8_input); + VK_CHECK_COND(groups_val == in_channels); - float scale = graph.extract_scalar(input_scale); - int32_t zp = graph.extract_scalar(input_zp); + float input_scale_val = graph.extract_scalar(input_scale); + int32_t input_zp_val = graph.extract_scalar(input_zp); - std::string kernel_name = "conv2d_q8ta_q8csw_linear_tiled"; - add_storage_type_suffix(kernel_name, graph.storage_type_of(output_image)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input_int_im2col)); + float output_inv_scale_val = 1.0f / graph.extract_scalar(output_scale); + int32_t output_zp_val = graph.extract_scalar(output_zp); + + std::string kernel_name = "conv2d_dw_q8ta_q8csw_q8to"; + add_storage_type_suffix( + kernel_name, 
graph.storage_type_of(packed_int8_output)); add_storage_type_suffix(kernel_name, graph.storage_type_of(packed_weight)); - add_dtype_suffix(kernel_name, graph.dtype_of(output_image)); + add_dtype_suffix(kernel_name, graph.dtype_of(packed_weight_scales)); vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); vkapi::ParamsBindList param_buffers = { - graph.sizes_ubo(output_image), - graph.sizes_ubo(input_image), + graph.sizes_ubo(packed_int8_output), + graph.sizes_ubo(packed_int8_input), graph.create_params_buffer(conv_params)}; std::vector push_constants = { - PushConstantDataInfo(&scale, sizeof(scale)), - PushConstantDataInfo(&zp, sizeof(zp)), + PushConstantDataInfo(&input_scale_val, sizeof(input_scale_val)), + PushConstantDataInfo(&input_zp_val, sizeof(input_zp_val)), + PushConstantDataInfo(&output_inv_scale_val, sizeof(output_inv_scale_val)), + PushConstantDataInfo(&output_zp_val, sizeof(output_zp_val)), }; uint32_t apply_bias = 1; @@ -446,11 +1162,11 @@ void add_conv2d_q8ta_q8csw_linear_node( graph.execute_nodes().emplace_back(new DynamicDispatchNode( graph, VK_KERNEL_FROM_STR(kernel_name), - col2im_global_wg_size, - quantized_linear_local_wg_size, + int8_conv2d_dw_global_wg_size, + default_pick_local_wg_size, // Inputs and Outputs - {{output_image, vkapi::kWrite}, - {{input_int_im2col, + {{packed_int8_output, vkapi::kWrite}, + {{packed_int8_input, packed_weight, packed_weight_sums, packed_weight_scales, @@ -463,7 +1179,7 @@ void add_conv2d_q8ta_q8csw_linear_node( // Specialization Constants {apply_bias}, // Resize args - {weight_data}, + {}, // Resizing Logic nullptr)); } @@ -564,16 +1280,12 @@ void quantized_conv2d_impl( ValueRef packed_weight_sums = prepack_standard( graph, weight_sums_data, utils::kBuffer, utils::kWidthPacked); - // Allocate quantized + packed im2col matrix for input - const int64_t num_blocks_M = utils::div_up_4(input_im2col_sizes.at(0)); - const int64_t num_blocks_K = utils::div_up_4(input_im2col_sizes.at(1)); - TmpTensor 
input_int_im2col( &graph, - {num_blocks_M, num_blocks_K * 4}, - vkapi::kInt, + input_im2col_sizes, + vkapi::kInt8x4, utils::kBuffer, - utils::kWidthPacked); + utils::kPackedInt8_4H4W); add_quantize_and_pack_im2col_node( graph, @@ -687,9 +1399,343 @@ void conv2d_q8csw(ComputeGraph& graph, const std::vector& args) { output_image); } +// Implementation for statically quantized conv2d, which expects input, weight, +// and output tensors to all have packed int8 dtype/memory layout. +void static_quantized_conv2d_impl( + ComputeGraph& graph, + const QuantizationConfig& input_quant_config, + const QuantizationConfig& weight_quant_config, + const QuantizationConfig& output_quant_config, + const ValueRef packed_int8_input, + const ValueRef input_scale, + const ValueRef input_zp, + const ValueRef weight_data, + const ValueRef weight_sums_data, + const ValueRef weight_scales_data, + const ValueRef output_scale, + const ValueRef output_zp, + const ValueRef bias_data, + const ValueRef kernel_size, + const ValueRef stride, + const ValueRef padding, + const ValueRef dilation, + const ValueRef groups, + const ValueRef packed_int8_output) { + // Currently, only certain quantization configs are supported + VK_CHECK_COND(input_quant_config.granularity == kPerTensor); + VK_CHECK_COND(input_quant_config.nbits == 8); + + VK_CHECK_COND(weight_quant_config.granularity == kPerChannel); + VK_CHECK_COND(weight_quant_config.nbits == 8); + VK_CHECK_COND(weight_quant_config.is_symmetric); + + VK_CHECK_COND(output_quant_config.granularity == kPerTensor); + VK_CHECK_COND(output_quant_config.nbits == 8); + + // Check for depthwise conv + const int64_t groups_val = graph.extract_scalar(groups); + const int64_t in_channels = graph.size_at(-3, packed_int8_input); + + // Depthwise convs have a specialized implementation, since the regular conv + // implementations requires that the number of input and output channels per + // groups is a multiple of 4. 
This is so that all values that are part of the + // same 4Wx4C block have the same group index. + const bool is_depthwise = (groups_val == in_channels); + + const bool use_im2col = should_use_im2col(&graph, kernel_size, groups); + // For pointwise convolution with stride = 1, padding = 0, dilation = 1, the + // input tensor is already equivalent to its im2col representation. In this + // case we can skip the im2col procedure and pass in the input image to the + // convolution_as_matmul implementation directly. + const bool is_optimizable_pw = + is_s1p0d1_pointwise(&graph, kernel_size, stride, padding, dilation); + + ValueRef packed_weight; + if (is_depthwise) { + packed_weight = prepack_quantized_conv2d_dw_weight( + graph, weight_quant_config, weight_data, kernel_size); + } else if (use_im2col) { + packed_weight = prepack_quantized_linear_weight( + graph, weight_quant_config, weight_data); + } else { + packed_weight = prepack_quantized_conv2d_weight( + graph, + weight_quant_config, + weight_data, + packed_int8_input, + packed_int8_output, + groups, + kernel_size); + } + + ValueRef packed_weight_sums = prepack_standard( + graph, weight_sums_data, utils::kBuffer, utils::kWidthPacked); + + ValueRef packed_weight_scales = prepack_standard( + graph, weight_scales_data, utils::kBuffer, utils::kWidthPacked); + + // See quantized_conv2d_impl for why this is needed + TmpTensor dummy_bias( + &graph, + {}, + graph.dtype_of(weight_scales_data), + utils::kBuffer, + utils::kWidthPacked); + + ValueRef packed_bias = dummy_bias.vref; + if (graph.val_is_not_none(bias_data)) { + packed_bias = + prepack_standard(graph, bias_data, utils::kBuffer, utils::kWidthPacked); + } + + // Depthwise conv path + if (is_depthwise) { + add_conv2d_dw_q8ta_q8csw_q8to_node( + graph, + packed_int8_input, + input_scale, + input_zp, + packed_weight, + packed_weight_sums, + packed_weight_scales, + output_scale, + output_zp, + bias_data, + packed_bias, + kernel_size, + stride, + padding, + dilation, + 
groups, + packed_int8_output); + return; + } + + std::vector input_im2col_sizes = + calculate_packed_int8_input_im2col_sizes( + &graph, packed_int8_input, packed_int8_output, kernel_size, groups); + + ValueRef packed_int8_input_im2col = packed_int8_input; + if (use_im2col && !is_optimizable_pw) { + TmpTensor packed_int8_input_im2col_tensor( + &graph, + input_im2col_sizes, + vkapi::kInt8x4, + utils::kBuffer, + utils::kPackedInt8_4W4C); + + packed_int8_input_im2col = packed_int8_input_im2col_tensor.vref; + + add_input_im2col_packed_int8_node( + graph, + packed_int8_input, + input_scale, + input_zp, + kernel_size, + stride, + padding, + dilation, + groups, + packed_int8_output, + packed_int8_input_im2col); + } + + add_conv2d_q8ta_q8csw_q8to_node( + graph, + packed_int8_input, + packed_int8_input_im2col, + input_scale, + input_zp, + packed_weight, + packed_weight_sums, + packed_weight_scales, + output_scale, + output_zp, + bias_data, + packed_bias, + kernel_size, + stride, + padding, + dilation, + groups, + packed_int8_output); +} + +void conv2d_q8ta_q8csw_q8to( + ComputeGraph& graph, + const std::vector& args) { + int32_t idx = 0; + const ValueRef packed_int8_input = args.at(idx++); + const ValueRef input_scale = args.at(idx++); + const ValueRef input_zp = args.at(idx++); + const ValueRef weight_data = args.at(idx++); + const ValueRef weight_sums_data = args.at(idx++); + const ValueRef weight_scales_data = args.at(idx++); + const ValueRef output_scale = args.at(idx++); + const ValueRef output_zp = args.at(idx++); + const ValueRef bias_data = args.at(idx++); + const ValueRef kernel_size = args.at(idx++); + const ValueRef stride = args.at(idx++); + const ValueRef padding = args.at(idx++); + const ValueRef dilation = args.at(idx++); + const ValueRef groups = args.at(idx++); + const ValueRef packed_int8_output = args.at(idx++); + + QuantizationConfig input_quant_config(8, kPerTensor, {}); + QuantizationConfig weight_quant_config(8, kPerChannel, {}); + QuantizationConfig 
output_quant_config(8, kPerTensor, {}); + + static_quantized_conv2d_impl( + graph, + input_quant_config, + weight_quant_config, + output_quant_config, + packed_int8_input, + input_scale, + input_zp, + weight_data, + weight_sums_data, + weight_scales_data, + output_scale, + output_zp, + bias_data, + kernel_size, + stride, + padding, + dilation, + groups, + packed_int8_output); +} + +// +// Quantize and dequantize operators +// + +void quantize_q8ta_for_conv2d( + ComputeGraph& graph, + const std::vector& args) { + int32_t idx = 0; + const ValueRef fp_input = args.at(idx++); + const ValueRef scale = args.at(idx++); + const ValueRef zero_point = args.at(idx++); + const ValueRef packed_int8_input = args.at(idx++); + + add_quantize_and_pack_q8ta_conv2d_input_node( + graph, fp_input, scale, zero_point, packed_int8_input); +} + +void dequantize_q8to_from_conv2d( + ComputeGraph& graph, + const std::vector& args) { + int32_t idx = 0; + const ValueRef packed_int8_output = args.at(idx++); + const ValueRef scale = args.at(idx++); + const ValueRef zero_point = args.at(idx++); + const ValueRef fp_output = args.at(idx++); + + add_unpack_and_dequantize_q8ta_conv2d_output_node( + graph, packed_int8_output, scale, zero_point, fp_output); +} + +void qdq8ta_conv2d_input( + ComputeGraph& graph, + const std::vector& args) { + int32_t idx = 0; + const ValueRef fp_input = args.at(idx++); + const ValueRef scale = args.at(idx++); + const ValueRef zero_point = args.at(idx++); + const ValueRef fp_output = args.at(idx++); + + TmpTensor packed_int8_input( + &graph, + graph.sizes_of(fp_input), + vkapi::kInt8x4, + utils::kBuffer, + utils::kPackedInt8_4W4C); + + add_quantize_and_pack_q8ta_conv2d_input_node( + graph, fp_input, scale, zero_point, packed_int8_input); + + add_unpack_and_dequantize_q8ta_conv2d_output_node( + graph, packed_int8_input, scale, zero_point, fp_output); +} + +// +// Test operators +// + +void conv2d_q8ta_q8csw_q8to_test( + ComputeGraph& graph, + const std::vector& args) { + 
int32_t idx = 0; + const ValueRef fp_input = args.at(idx++); + const ValueRef input_scale = args.at(idx++); + const ValueRef input_zp = args.at(idx++); + const ValueRef weight_data = args.at(idx++); + const ValueRef weight_sums_data = args.at(idx++); + const ValueRef weight_scales_data = args.at(idx++); + const ValueRef output_scale = args.at(idx++); + const ValueRef output_zp = args.at(idx++); + const ValueRef bias_data = args.at(idx++); + const ValueRef kernel_size = args.at(idx++); + const ValueRef stride = args.at(idx++); + const ValueRef padding = args.at(idx++); + const ValueRef dilation = args.at(idx++); + const ValueRef groups = args.at(idx++); + const ValueRef fp_output = args.at(idx++); + + TmpTensor packed_int8_input( + &graph, + graph.sizes_of(fp_input), + vkapi::kInt8x4, + utils::kBuffer, + utils::kPackedInt8_4W4C); + + TmpTensor packed_int8_output( + &graph, + graph.sizes_of(fp_output), + vkapi::kInt8x4, + utils::kBuffer, + utils::kPackedInt8_4W4C); + + add_quantize_and_pack_q8ta_conv2d_input_node( + graph, fp_input, input_scale, input_zp, packed_int8_input); + + std::vector conv2d_args = { + packed_int8_input, + input_scale, + input_zp, + weight_data, + weight_sums_data, + weight_scales_data, + output_scale, + output_zp, + bias_data, + kernel_size, + stride, + padding, + dilation, + groups, + packed_int8_output}; + + conv2d_q8ta_q8csw_q8to(graph, conv2d_args); + + add_unpack_and_dequantize_q8ta_conv2d_output_node( + graph, packed_int8_output, output_scale, output_zp, fp_output); +} + REGISTER_OPERATORS { VK_REGISTER_OP(et_vk.conv2d_q8ta_q8csw.default, conv2d_q8ta_q8csw); VK_REGISTER_OP(et_vk.conv2d_q8csw.default, conv2d_q8csw); + VK_REGISTER_OP(etvk.qdq8ta_conv2d_input.default, qdq8ta_conv2d_input); + VK_REGISTER_OP(etvk.conv2d_q8ta_q8csw_q8to.test, conv2d_q8ta_q8csw_q8to_test); + VK_REGISTER_OP( + et_vk.quantize_q8ta_for_conv2d.default, quantize_q8ta_for_conv2d); + VK_REGISTER_OP( + et_vk.dequantize_q8to_from_conv2d.default, 
dequantize_q8to_from_conv2d); + VK_REGISTER_OP(et_vk.conv2d_q8ta_q8csw_q8to.default, conv2d_q8ta_q8csw_q8to); + VK_REGISTER_OP( + et_vk.conv2d_q8ta_q8csw_q8to_dw.default, conv2d_q8ta_q8csw_q8to); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.h b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.h new file mode 100644 index 00000000000..33474cee47b --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace vkcompute { + +// +// Quantize and dequantize functions for conv2d that can be reused by other +// operations +// + +/** + * Add a dispatch node to quantize a floating-point input tensor to a packed + * int8 tensor for use in quantized operations. + */ +void add_quantize_and_pack_q8ta_conv2d_input_node( + ComputeGraph& graph, + const ValueRef fp_input, + const ValueRef input_scale, + const ValueRef input_zp, + const ValueRef packed_int8_input); + +/** + * Add a dispatch node to unpack and dequantize a packed int8 output tensor back + * to a floating-point tensor. 
+ */ +void add_unpack_and_dequantize_q8ta_conv2d_output_node( + ComputeGraph& graph, + const ValueRef packed_int8_output, + const ValueRef output_scale, + const ValueRef output_zp, + const ValueRef fp_output); + +} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp index 7fbfcee5cb1..97566038501 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp @@ -77,6 +77,10 @@ utils::uvec3 quantized_linear_global_wg_size( M_per_tile = 1; } + if (shader.kernel_name.find("q8ta_q8csw_tiled") != std::string::npos) { + N_per_tile = 8; + } + const uint32_t num_N_tiles = utils::div_up(N, N_per_tile); const uint32_t num_M_tiles = utils::div_up(M, M_per_tile); @@ -802,20 +806,12 @@ void quantized_linear_impl( graph, weight_sums_data, utils::kBuffer, utils::kWidthPacked); // Allocate temporary tensor to store quantized and packed input - - int64_t num_blocks_M, num_blocks_K; - std::tie(num_blocks_M, num_blocks_K) = - get_quantized_input_num_blocks(graph, fp_input); - - const int64_t int_input_height = num_blocks_M; - const int64_t int_input_width = num_blocks_K * 4; - TmpTensor packed_int_input( &graph, - {int_input_height, int_input_width}, - vkapi::kInt, + graph.sizes_of(fp_input), + vkapi::kInt8x4, utils::kBuffer, - utils::kWidthPacked); + utils::kPackedInt8_4H4W); // Non dynamically quantized input case if (!input_quant_config.is_dynamic) { diff --git a/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp b/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp index 13801b45cc7..e2b73b2f3f2 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp @@ -32,8 +32,13 @@ void add_squeeze_copy_dims_node( // 2. Squeeze outter most dim // For these cases, just pass input to output via clone. 
for (int i = 0; i < dims.size(); ++i) { - if (dims.at(i) != 0 && in_sizes.at(dims.at(i)) == 1) { - squeeze_dims.push_back(dims.at(i)); + // adjust negative dims + int64_t dim_val = dims.at(i); + if (dim_val < 0) { + dim_val += in_dim; + } + if (dims.at(i) != 0 && in_sizes.at(dim_val) == 1) { + squeeze_dims.push_back(dim_val); } } if (squeeze_dims.size() == 0) { diff --git a/backends/vulkan/runtime/utils/StorageUtils.cpp b/backends/vulkan/runtime/utils/StorageUtils.cpp new file mode 100644 index 00000000000..cfe3d9e159a --- /dev/null +++ b/backends/vulkan/runtime/utils/StorageUtils.cpp @@ -0,0 +1,25 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace vkcompute { +namespace utils { + +bool is_packed_int8_layout(const GPUMemoryLayout layout) { + switch (layout) { + case kPackedInt8_4W4C: + case kPackedInt8_4H4W: + return true; + default: + return false; + } +} + +} // namespace utils +} // namespace vkcompute diff --git a/backends/vulkan/runtime/utils/StorageUtils.h b/backends/vulkan/runtime/utils/StorageUtils.h index 20addf88c53..76edec897c7 100644 --- a/backends/vulkan/runtime/utils/StorageUtils.h +++ b/backends/vulkan/runtime/utils/StorageUtils.h @@ -84,9 +84,24 @@ enum class GPUMemoryLayout : uint8_t { * 2. For texture backed tensors, the packed dim will be the specified dim. * The axis map will be `{0, 1, 2, 2}`. */ + TENSOR_WIDTH_PACKED = 0u, TENSOR_HEIGHT_PACKED = 1u, TENSOR_CHANNELS_PACKED = 2u, + + /* + * The following memory layouts are used for quantized int8 tensors. For the + * above "standard" memory layouts, 4 elements along the packed dim are stored + * in each texel (4-component vectorized type). 
However, for packed int8 + * memory layouts, an additional level of packing is used where 4 int8 values + * are packed into each int32, and each int32 is packed into each ivec4. + * Conceptually, this allows an additional packed dimension to be used. + * When loading a ivec4 from the GPU storage buffer / texture, data for a + * 16 element block is loaded, rather than 4 elements along one dimension. + */ + + TENSOR_PACKED_INT8_4W4C = 3u, + TENSOR_PACKED_INT8_4H4W = 4u, }; static constexpr GPUMemoryLayout kWidthPacked = @@ -98,6 +113,12 @@ static constexpr GPUMemoryLayout kHeightPacked = static constexpr GPUMemoryLayout kChannelsPacked = GPUMemoryLayout::TENSOR_CHANNELS_PACKED; +static constexpr GPUMemoryLayout kPackedInt8_4W4C = + GPUMemoryLayout::TENSOR_PACKED_INT8_4W4C; + +static constexpr GPUMemoryLayout kPackedInt8_4H4W = + GPUMemoryLayout::TENSOR_PACKED_INT8_4H4W; + template T to_packed_dim(const GPUMemoryLayout layout) { switch (layout) { @@ -107,11 +128,17 @@ T to_packed_dim(const GPUMemoryLayout layout) { return 1; case kChannelsPacked: return 2; + case kPackedInt8_4W4C: + return 2; + case kPackedInt8_4H4W: + return 0; }; // Should be unreachable return 0; } +bool is_packed_int8_layout(const GPUMemoryLayout layout); + inline std::ostream& operator<<( std::ostream& os, const StorageType storage_type) { @@ -142,6 +169,12 @@ inline std::ostream& operator<<( case kChannelsPacked: os << "TENSOR_CHANNELS_PACKED"; break; + case kPackedInt8_4W4C: + os << "TENSOR_PACKED_INT8_4W4C"; + break; + case kPackedInt8_4H4W: + os << "TENSOR_PACKED_INT8_4H4W"; + break; } return os; } diff --git a/backends/vulkan/runtime/vk_api/Exception.cpp b/backends/vulkan/runtime/vk_api/Exception.cpp index d3efa81e52a..5bcf047aaf1 100644 --- a/backends/vulkan/runtime/vk_api/Exception.cpp +++ b/backends/vulkan/runtime/vk_api/Exception.cpp @@ -10,6 +10,13 @@ #include +#ifdef ETVK_BOOST_STACKTRACE_AVAILABLE +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif // _GNU_SOURCE +#include +#endif // 
ETVK_BOOST_STACKTRACE_AVAILABLE + namespace vkcompute { namespace vkapi { @@ -65,6 +72,11 @@ Error::Error(SourceLocation source_location, std::string msg) std::ostringstream oss; oss << "Exception raised from " << source_location_ << ": "; oss << msg_; +#ifdef ETVK_BOOST_STACKTRACE_AVAILABLE + oss << "\n"; + oss << "Stack trace:\n"; + oss << boost::stacktrace::stacktrace(); +#endif // ETVK_BOOST_STACKTRACE_AVAILABLE what_ = oss.str(); } @@ -74,6 +86,11 @@ Error::Error(SourceLocation source_location, const char* cond, std::string msg) oss << "Exception raised from " << source_location_ << ": "; oss << "(" << cond << ") is false! "; oss << msg_; +#ifdef ETVK_BOOST_STACKTRACE_AVAILABLE + oss << "\n"; + oss << "Stack trace:\n"; + oss << boost::stacktrace::stacktrace(); +#endif // ETVK_BOOST_STACKTRACE_AVAILABLE what_ = oss.str(); } diff --git a/backends/vulkan/runtime/vk_api/Types.h b/backends/vulkan/runtime/vk_api/Types.h index b3309aa6c69..f4415b5c08f 100644 --- a/backends/vulkan/runtime/vk_api/Types.h +++ b/backends/vulkan/runtime/vk_api/Types.h @@ -43,7 +43,8 @@ _(double, VK_FORMAT_R64G64B64A64_SFLOAT, Double) \ _(int8_t, VK_FORMAT_R8G8B8A8_SINT, QInt8) \ _(uint8_t, VK_FORMAT_R8G8B8A8_UINT, QUInt8) \ - _(int32_t, VK_FORMAT_R32G32B32A32_SINT, QInt32) + _(int32_t, VK_FORMAT_R32G32B32A32_SINT, QInt32) \ + _(int32_t, VK_FORMAT_R32G32B32A32_SINT, Int8x4) namespace vkcompute { namespace vkapi { diff --git a/backends/vulkan/serialization/schema.fbs b/backends/vulkan/serialization/schema.fbs index 4bc12208ce7..9d738bc386f 100644 --- a/backends/vulkan/serialization/schema.fbs +++ b/backends/vulkan/serialization/schema.fbs @@ -40,6 +40,8 @@ enum VkMemoryLayout : ubyte { TENSOR_WIDTH_PACKED = 0, TENSOR_HEIGHT_PACKED = 1, TENSOR_CHANNELS_PACKED = 2, + PACKED_INT8_4W4C = 3, + PACKED_INT8_4H4W = 4, DEFAULT_LAYOUT = 255, } diff --git a/backends/vulkan/serialization/vulkan_graph_schema.py b/backends/vulkan/serialization/vulkan_graph_schema.py index cf5326f40cf..236183ce42f 100644 
--- a/backends/vulkan/serialization/vulkan_graph_schema.py +++ b/backends/vulkan/serialization/vulkan_graph_schema.py @@ -48,6 +48,8 @@ class VkMemoryLayout(IntEnum): TENSOR_WIDTH_PACKED = 0 TENSOR_HEIGHT_PACKED = 1 TENSOR_CHANNELS_PACKED = 2 + PACKED_INT8_4W4C = 3 + PACKED_INT8_4H4W = 4 DEFAULT_LAYOUT = 255 def __str__(self) -> str: diff --git a/backends/vulkan/targets.bzl b/backends/vulkan/targets.bzl index a9ba62b6f9f..c48ce0a452b 100644 --- a/backends/vulkan/targets.bzl +++ b/backends/vulkan/targets.bzl @@ -19,6 +19,8 @@ def get_vulkan_preprocessor_flags(no_volk, is_fbcode): default_flags = [] android_flags = [] + debug_mode = read_config("etvk", "debug", "0") == "1" + if not no_volk: for flags in [default_flags, android_flags]: flags.append("-DUSE_VULKAN_WRAPPER") @@ -32,6 +34,10 @@ def get_vulkan_preprocessor_flags(no_volk, is_fbcode): if link_moltenvk: mac_flags = [] + if debug_mode: + mac_flags.append("-DETVK_BOOST_STACKTRACE_AVAILABLE") + default_flags.append("-DETVK_BOOST_STACKTRACE_AVAILABLE") + VK_API_PREPROCESSOR_FLAGS += select({ "DEFAULT": default_flags, "ovr_config//os:android": android_flags, @@ -59,7 +65,6 @@ def get_vulkan_preprocessor_flags(no_volk, is_fbcode): if etvk_default_cache_path != "": VK_API_PREPROCESSOR_FLAGS += ["-DETVK_DEFAULT_CACHE_PATH={}".format(etvk_default_cache_path)] - debug_mode = read_config("etvk", "debug", "0") == "1" if debug_mode: VK_API_PREPROCESSOR_FLAGS += ["-DVULKAN_DEBUG"] @@ -136,6 +141,8 @@ def vulkan_spv_shader_lib(name, spv_filegroups, is_fbcode = False, no_volk = Fal ) def define_common_targets(is_fbcode = False): + debug_mode = read_config("etvk", "debug", "0") == "1" + runtime.python_library( name = "gen_vulkan_spv_lib", srcs = [ @@ -200,6 +207,10 @@ def define_common_targets(is_fbcode = False): "//third-party/khronos:moltenVK_static" ] + if debug_mode: + mac_deps.append("fbsource//third-party/boost:boost") + default_deps.append("fbsource//third-party/boost:boost") + VK_API_DEPS += select({ "DEFAULT": 
default_deps, "ovr_config//os:android": android_deps, diff --git a/backends/vulkan/test/TARGETS b/backends/vulkan/test/TARGETS index 53fad86f90c..ee296a4f68f 100644 --- a/backends/vulkan/test/TARGETS +++ b/backends/vulkan/test/TARGETS @@ -34,7 +34,6 @@ python_unittest( deps = [ "//caffe2:torch", "//executorch/backends/vulkan/_passes:vulkan_passes", - "//executorch/backends/vulkan/quantizer:vulkan_quantizer", "//executorch/backends/vulkan:vulkan_preprocess", "//pytorch/ao:torchao", # @manual ] diff --git a/backends/vulkan/test/custom_ops/CMakeLists.txt b/backends/vulkan/test/custom_ops/CMakeLists.txt index 97b632338db..fc1d33391d4 100644 --- a/backends/vulkan/test/custom_ops/CMakeLists.txt +++ b/backends/vulkan/test/custom_ops/CMakeLists.txt @@ -95,4 +95,8 @@ if(TARGET vulkan_backend) add_operator_prototype(q8csw_conv2d) add_operator_prototype(q4gsw_linear) add_operator_prototype(choose_qparams_per_row) + add_operator_prototype(qdq8ta_conv2d_activations) + add_operator_prototype(q8ta_q8csw_q8to_conv2d) + add_operator_prototype(q8ta_q8csw_q8to_conv2d_dw) + add_operator_prototype(q8ta_q8ta_q8to_add) endif() diff --git a/backends/vulkan/test/custom_ops/conv2d_utils.cpp b/backends/vulkan/test/custom_ops/conv2d_utils.cpp new file mode 100644 index 00000000000..74c26cef5a1 --- /dev/null +++ b/backends/vulkan/test/custom_ops/conv2d_utils.cpp @@ -0,0 +1,10 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include "conv2d_utils.h" + +// Implementation file for conv2d utilities. +// Currently all functionality is implemented inline in the header. 
diff --git a/backends/vulkan/test/custom_ops/conv2d_utils.h b/backends/vulkan/test/custom_ops/conv2d_utils.h new file mode 100644 index 00000000000..cad52219062 --- /dev/null +++ b/backends/vulkan/test/custom_ops/conv2d_utils.h @@ -0,0 +1,88 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include +#include + +namespace executorch { +namespace vulkan { +namespace prototyping { + +// Component structs for better readability +struct KernelSize { + int32_t h; + int32_t w; + + KernelSize(int32_t height, int32_t width) : h(height), w(width) {} +}; + +struct Stride { + int32_t h; + int32_t w; + + Stride(int32_t height, int32_t width) : h(height), w(width) {} +}; + +struct Padding { + int32_t h; + int32_t w; + + Padding(int32_t height, int32_t width) : h(height), w(width) {} +}; + +struct Dilation { + int32_t h; + int32_t w; + + Dilation(int32_t height = 1, int32_t width = 1) : h(height), w(width) {} +}; + +struct OutInChannels { + int32_t out; + int32_t in; + + OutInChannels(int32_t out_channels, int32_t in_channels) + : out(out_channels), in(in_channels) {} +}; + +struct InputSize2D { + int32_t h; + int32_t w; + + InputSize2D(int32_t height, int32_t width) : h(height), w(width) {} +}; + +// Conv2d configuration struct +struct Conv2dConfig { + OutInChannels channels; + InputSize2D input_size; + KernelSize kernel; + Stride stride; + Padding padding; + Dilation dilation; + int32_t groups; // Number of groups for grouped convolution + std::string test_case_name = "placeholder"; + std::string op_name = "conv2d"; + + // Calculate output dimensions + int64_t get_output_height() const { + return (input_size.h + 2 * padding.h - dilation.h * (kernel.h - 1) - 1) / + stride.h + + 1; + } + + int64_t get_output_width() const { + return (input_size.w + 2 * padding.w - dilation.w * (kernel.w - 1) 
- 1) / + stride.w + + 1; + } +}; + +} // namespace prototyping +} // namespace vulkan +} // namespace executorch diff --git a/backends/vulkan/test/custom_ops/q8csw_conv2d.cpp b/backends/vulkan/test/custom_ops/q8csw_conv2d.cpp index d566e5b2646..219bccb04c3 100644 --- a/backends/vulkan/test/custom_ops/q8csw_conv2d.cpp +++ b/backends/vulkan/test/custom_ops/q8csw_conv2d.cpp @@ -8,6 +8,7 @@ #include #include #include +#include "conv2d_utils.h" #include "utils.h" #include @@ -18,76 +19,6 @@ using namespace vkcompute; static constexpr int64_t kRefDimSizeLimit = 100; -// Component structs for better readability -struct KernelSize { - int32_t h; - int32_t w; - - KernelSize(int32_t height, int32_t width) : h(height), w(width) {} -}; - -struct Stride { - int32_t h; - int32_t w; - - Stride(int32_t height, int32_t width) : h(height), w(width) {} -}; - -struct Padding { - int32_t h; - int32_t w; - - Padding(int32_t height, int32_t width) : h(height), w(width) {} -}; - -struct Dilation { - int32_t h; - int32_t w; - - Dilation(int32_t height = 1, int32_t width = 1) : h(height), w(width) {} -}; - -struct OutInChannels { - int32_t out; - int32_t in; - - OutInChannels(int32_t out_channels, int32_t in_channels) - : out(out_channels), in(in_channels) {} -}; - -struct InputSize2D { - int32_t h; - int32_t w; - - InputSize2D(int32_t height, int32_t width) : h(height), w(width) {} -}; - -// Conv2d configuration struct -struct Conv2dConfig { - OutInChannels channels; - InputSize2D input_size; - KernelSize kernel; - Stride stride; - Padding padding; - Dilation dilation; - int32_t groups; // Number of groups for grouped convolution - std::string test_case_name = "placeholder"; - std::string op_name = "conv2d_q8ta_q8csw"; - - // Calculate output dimensions - int64_t get_output_height() const { - return (input_size.h + 2 * padding.h - dilation.h * (kernel.h - 1) - 1) / - stride.h + - 1; - } - - int64_t get_output_width() const { - return (input_size.w + 2 * padding.w - dilation.w * (kernel.w - 
1) - 1) / - stride.w + - 1; - } -}; - // Utility function to create a test case from a Conv2dConfig TestCase create_test_case_from_config( const Conv2dConfig& config, @@ -366,13 +297,20 @@ std::vector generate_quantized_conv2d_test_cases() { Stride(1, 1), Padding(1, 1), Dilation(1, 1), - 8}, + 1}, {OutInChannels(128, 64), InputSize2D(128, 128), KernelSize(3, 3), Stride(1, 1), Padding(1, 1), Dilation(1, 1), + 1}, + {OutInChannels(128, 1024), + InputSize2D(128, 128), + KernelSize(1, 1), + Stride(1, 1), + Padding(0, 0), + Dilation(1, 1), 1}}; // Test with different storage types and data types @@ -394,6 +332,7 @@ std::vector generate_quantized_conv2d_test_cases() { std::to_string(config.kernel.h) + "/" + std::to_string(config.kernel.w); + config.op_name = "conv2d_q8ta_q8csw"; config.test_case_name = prefix + suffix; // The default operator tested is activation + weight quantized conv2d; // however, only test this if the int8 dot product extension is supported @@ -763,7 +702,7 @@ int64_t quantized_conv2d_flop_calculator(const TestCase& test_case) { int main(int argc, char* argv[]) { set_debugging(false); set_print_output(false); - set_print_latencies(false); + set_print_latencies(true); set_use_gpu_timestamps(true); print_performance_header(); diff --git a/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d.cpp b/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d.cpp new file mode 100644 index 00000000000..8762fe4c0d1 --- /dev/null +++ b/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d.cpp @@ -0,0 +1,628 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include +#include +#include +#include +#include "conv2d_utils.h" +#include "utils.h" + +#include + +using namespace executorch::vulkan::prototyping; + +using namespace vkcompute; + +static constexpr int64_t kRefDimSizeLimit = 100; + +// Utility function to create a test case from a Conv2dConfig +TestCase create_test_case_from_config( + const Conv2dConfig& config, + utils::StorageType storage_type, + vkapi::ScalarType input_dtype) { + TestCase test_case; + + // Create a descriptive name for the test case + std::string storage_str = + (storage_type == utils::kTexture3D) ? "Texture3D" : "Buffer"; + std::string dtype_str = (input_dtype == vkapi::kFloat) ? "Float" : "Half"; + + std::string test_name = + config.test_case_name + "_" + storage_str + "_" + dtype_str; + test_case.set_name(test_name); + + // Set the operator name for the test case + std::string operator_name = "etvk." + config.op_name + ".test"; + test_case.set_operator_name(operator_name); + + // Calculate output dimensions + int64_t H_out = config.get_output_height(); + int64_t W_out = config.get_output_width(); + + // Input tensor (float/half) - [1, C_in, H_in, W_in] (batch size always 1) + std::vector input_size = { + 1, config.channels.in, config.input_size.h, config.input_size.w}; + + ValueSpec input_tensor( + input_size, + input_dtype, + storage_type, + utils::kChannelsPacked, + DataGenType::RANDOM); + + if (debugging()) { + print_valuespec_data(input_tensor, "input_tensor"); + } + + float input_scale_val = 0.008123; + ValueSpec input_scale(input_scale_val); + + int32_t input_zero_point_val = 2; + ValueSpec input_zero_point(input_zero_point_val); + + // Quantized weight tensor (int8) - [C_out, C_in_per_group * K_h * K_w] + // Memory layout: height, width, then channels - in_c is innermost (stride 1) + // in the second dimension + const int64_t in_channels_per_group = config.channels.in / config.groups; + const int64_t in_features = utils::align_up_4( + in_channels_per_group * config.kernel.h * 
config.kernel.w); + std::vector weight_size = {config.channels.out, in_features}; + ValueSpec quantized_weight( + weight_size, + vkapi::kChar, // int8 for quantized weights + storage_type, + utils::kWidthPacked, + DataGenType::RANDINT8); + quantized_weight.set_constant(true); + + if (debugging()) { + print_valuespec_data(quantized_weight, "weight_tensor"); + } + + const int64_t aligned_out_channels = utils::align_up_4(config.channels.out); + + // Weight quantization scales (float/half, per-channel) + ValueSpec weight_scales( + {aligned_out_channels}, // Per output channel + input_dtype, + storage_type, + utils::kWidthPacked, + DataGenType::RANDOM_SCALES); + weight_scales.set_constant(true); + + ValueSpec weight_sums( + {aligned_out_channels}, // Per output channel + vkapi::kInt, + storage_type, + utils::kWidthPacked, + DataGenType::ZEROS); + weight_sums.set_constant(true); + + // Compute weight_sums data based on quantized weights + compute_weight_sums( + weight_sums, quantized_weight, config.channels.out, in_features); + + // Bias (optional, float/half) - [C_out] + ValueSpec bias( + {aligned_out_channels}, // Per output channel + input_dtype, + storage_type, + utils::kWidthPacked, + DataGenType::ZEROS); + bias.set_constant(true); + + // Output quantization parameters + // float output_scale_val = 0.01432; + float output_scale_val = 0.05314; + ValueSpec output_scale(output_scale_val); + + int32_t output_zero_point_val = -1; + ValueSpec output_zero_point(output_zero_point_val); + + // Stride and padding parameters + ValueSpec stride({config.stride.h, config.stride.w}); + ValueSpec padding({config.padding.h, config.padding.w}); + + // Dilation and groups parameters + ValueSpec dilation({config.dilation.h, config.dilation.w}); + ValueSpec groups(config.groups); + + // Kernel size parameters + ValueSpec kernel_size({config.kernel.h, config.kernel.w}); + + // Output tensor (float/half) - [1, C_out, H_out, W_out] (batch size always 1) + ValueSpec output( + {1, 
config.channels.out, H_out, W_out}, + input_dtype, + storage_type, + utils::kChannelsPacked, + DataGenType::ZEROS); + + // Add all specs to test case for q8ta_q8csw_q8to operation + test_case.add_input_spec(input_tensor); + test_case.add_input_spec(input_scale); + test_case.add_input_spec(input_zero_point); + test_case.add_input_spec(quantized_weight); + test_case.add_input_spec(weight_sums); + test_case.add_input_spec(weight_scales); + test_case.add_input_spec(output_scale); + test_case.add_input_spec(output_zero_point); + test_case.add_input_spec(bias); + test_case.add_input_spec(kernel_size); + test_case.add_input_spec(stride); + test_case.add_input_spec(padding); + test_case.add_input_spec(dilation); + test_case.add_input_spec(groups); + + test_case.add_output_spec(output); + + test_case.set_abs_tolerance(output_scale_val + 1e-4f); + + return test_case; +} + +// Generate easy test cases for quantized conv2d operation (for debugging) +std::vector generate_quantized_conv2d_easy_cases() { + std::vector test_cases; + + // Single simple configuration for debugging + Conv2dConfig config = { + OutInChannels(16, 8), // channels (out, in) + InputSize2D(21, 17), // input_size (h, w) + KernelSize(3, 3), // kernel + Stride(1, 1), // stride + Padding(1, 1), // padding + Dilation(1, 1), // dilation + 2, // groups + }; + config.op_name = "conv2d_q8ta_q8csw_q8to"; + + // Test with both storage types and data types for completeness + std::vector storage_types = {utils::kTexture3D}; + std::vector float_types = {vkapi::kFloat}; + + // Generate test cases for each combination + for (const auto& storage_type : storage_types) { + for (const auto& input_dtype : float_types) { + test_cases.push_back( + create_test_case_from_config(config, storage_type, input_dtype)); + } + } + + return test_cases; +} + +// Generate test cases for quantized conv2d operation +std::vector generate_quantized_conv2d_test_cases() { + std::vector test_cases; + + std::vector configs = { + // Pointwise 
convolutions: kernel size 1x1 + {OutInChannels(32, 3), + InputSize2D(64, 64), + KernelSize(1, 1), + Stride(1, 1), + Padding(0, 0), + Dilation(1, 1), + 1}, + {OutInChannels(64, 32), + InputSize2D(32, 32), + KernelSize(1, 1), + Stride(1, 1), + Padding(0, 0), + Dilation(1, 1), + 1}, + {OutInChannels(96, 64), + InputSize2D(16, 16), + KernelSize(1, 1), + Stride(1, 1), + Padding(0, 0), + Dilation(1, 1), + 1}, + {OutInChannels(13, 7), + InputSize2D(57, 33), + KernelSize(1, 1), + Stride(1, 1), + Padding(0, 0), + Dilation(1, 1), + 1}, + // General 2D convolutions + {OutInChannels(32, 3), + InputSize2D(64, 64), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + 1}, + {OutInChannels(32, 3), + InputSize2D(64, 64), + KernelSize(3, 3), + Stride(2, 2), + Padding(1, 1), + Dilation(1, 1), + 1}, + {OutInChannels(64, 32), + InputSize2D(8, 8), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + 1}, + {OutInChannels(64, 32), + InputSize2D(64, 64), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + 1}, + {OutInChannels(64, 32), + InputSize2D(64, 64), + KernelSize(3, 3), + Stride(2, 2), + Padding(1, 1), + Dilation(1, 1), + 1}, + {OutInChannels(16, 32), + InputSize2D(77, 77), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + 1}, + // Grouped convolutions + {OutInChannels(64, 32), + InputSize2D(64, 64), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + 2}, + {OutInChannels(96, 96), + InputSize2D(81, 81), + KernelSize(3, 3), + Stride(2, 2), + Padding(1, 1), + Dilation(1, 1), + 3}, + {OutInChannels(96, 96), + InputSize2D(64, 64), + KernelSize(5, 5), + Stride(2, 2), + Padding(2, 2), + Dilation(1, 1), + 4}, + // Performance cases (pointwise) + {OutInChannels(128, 128), + InputSize2D(128, 128), + KernelSize(1, 1), + Stride(1, 1), + Padding(0, 0), + Dilation(1, 1), + 1}, + {OutInChannels(128, 128), + InputSize2D(128, 128), + KernelSize(1, 1), + Stride(1, 1), + Padding(0, 0), + 
Dilation(1, 1), + 1}, + // Performance cases (general 2d convs) + {OutInChannels(32, 3), + InputSize2D(256, 256), + KernelSize(3, 3), + Stride(1, 1), + Padding(0, 0), + Dilation(1, 1), + 1}, + {OutInChannels(64, 32), + InputSize2D(128, 128), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + 1}, + {OutInChannels(64, 64), + InputSize2D(128, 128), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + 1}, + {OutInChannels(128, 128), + InputSize2D(128, 128), + KernelSize(5, 5), + Stride(2, 2), + Padding(2, 2), + Dilation(1, 1), + 4}}; + + // Test with different storage types and data types + std::vector storage_types = {utils::kTexture3D}; + + // Generate test cases for each combination + for (auto& config : configs) { + for (const auto& storage_type : storage_types) { + // Generate test case name programmatically + bool is_performance = config.channels.out > kRefDimSizeLimit || + config.channels.in > kRefDimSizeLimit || + config.input_size.h > kRefDimSizeLimit || + config.input_size.w > kRefDimSizeLimit; + std::string prefix = is_performance ? 
"performance_" : "correctness_"; + std::string suffix = std::to_string(config.channels.out) + "/" + + std::to_string(config.channels.in) + "_" + + std::to_string(config.input_size.h) + "/" + + std::to_string(config.input_size.w) + "_" + + std::to_string(config.kernel.h) + "/" + + std::to_string(config.kernel.w); + + config.op_name = "conv2d_q8ta_q8csw_q8to"; + config.test_case_name = prefix + suffix; + + // Only test q8ta_q8csw_q8to if the int8 dot product extension is + // supported + if (vkcompute::api::context() + ->adapter_ptr() + ->supports_int8_dot_product()) { + test_cases.push_back( + create_test_case_from_config(config, storage_type, vkapi::kFloat)); + } + } + } + + return test_cases; +} + +// Reference implementation for activation, weight, and output quantized conv2d +void conv2d_q8ta_q8csw_q8to_reference_impl(TestCase& test_case) { + // Extract input specifications + int32_t idx = 0; + const ValueSpec& input_spec = test_case.inputs()[idx++]; + const ValueSpec& input_scale_spec = test_case.inputs()[idx++]; + const ValueSpec& input_zeros_spec = test_case.inputs()[idx++]; + const ValueSpec& weight_spec = test_case.inputs()[idx++]; + const ValueSpec& weight_sums_spec = test_case.inputs()[idx++]; + (void)weight_sums_spec; + const ValueSpec& weight_scales_spec = test_case.inputs()[idx++]; + const ValueSpec& output_scale_spec = test_case.inputs()[idx++]; + const ValueSpec& output_zeros_spec = test_case.inputs()[idx++]; + const ValueSpec& bias_spec = test_case.inputs()[idx++]; + const ValueSpec& kernel_size_spec = test_case.inputs()[idx++]; + const ValueSpec& stride_spec = test_case.inputs()[idx++]; + const ValueSpec& padding_spec = test_case.inputs()[idx++]; + const ValueSpec& dilation_spec = test_case.inputs()[idx++]; + const ValueSpec& groups_spec = test_case.inputs()[idx++]; + + // Extract output specification (mutable reference) + ValueSpec& output_spec = test_case.outputs()[0]; + + // Get tensor dimensions + auto input_sizes = 
input_spec.get_tensor_sizes(); // [N, C_in, H_in, W_in] + auto weight_sizes = + weight_spec.get_tensor_sizes(); // [C_out, C_in_per_group * K_h * K_w] + auto output_sizes = + output_spec.get_tensor_sizes(); // [N, C_out, H_out, W_out] + + int64_t N = input_sizes[0]; + int64_t C_in = input_sizes[1]; + int64_t H_in = input_sizes[2]; + int64_t W_in = input_sizes[3]; + int64_t C_out = output_sizes[1]; + int64_t H_out = output_sizes[2]; + int64_t W_out = output_sizes[3]; + + // Get kernel dimensions from kernel_size ValueSpec + auto kernel_size_data = kernel_size_spec.get_int32_data(); + int64_t K_h = kernel_size_data[0]; + int64_t K_w = kernel_size_data[1]; + + // Get stride, padding, dilation, and groups + auto stride_data = stride_spec.get_int32_data(); + auto padding_data = padding_spec.get_int32_data(); + auto dilation_data = dilation_spec.get_int32_data(); + int64_t stride_h = stride_data[0]; + int64_t stride_w = stride_data[1]; + int64_t pad_h = padding_data[0]; + int64_t pad_w = padding_data[1]; + int64_t dilation_h = dilation_data[0]; + int64_t dilation_w = dilation_data[1]; + int64_t groups = groups_spec.get_int_value(); + + // Skip for large tensors since computation time will be extremely slow + if (N > kRefDimSizeLimit || C_in > kRefDimSizeLimit || + H_in > kRefDimSizeLimit || W_in > kRefDimSizeLimit || + C_out > kRefDimSizeLimit) { + throw std::invalid_argument( + "One or more dimensions exceed the allowed limit for reference implementation."); + } + + if (input_spec.dtype != vkapi::kFloat) { + throw std::invalid_argument("Unsupported dtype"); + } + + // Get raw data pointers + auto& input_data = input_spec.get_float_data(); + const float input_scale = input_scale_spec.get_float_value(); + const int32_t input_zero_point = input_zeros_spec.get_int_value(); + + auto& weight_data = weight_spec.get_int8_data(); + auto& weight_scales_data = weight_scales_spec.get_float_data(); + auto& bias_data = bias_spec.get_float_data(); + + const float output_scale = 
output_scale_spec.get_float_value(); + const int32_t output_zero_point = output_zeros_spec.get_int_value(); + + // Calculate channels per group for grouped convolution + int64_t C_in_per_group = C_in / groups; + int64_t C_out_per_group = C_out / groups; + + // Calculate number of output elements + int64_t num_output_elements = N * C_out * H_out * W_out; + + auto& ref_data = output_spec.get_ref_float_data(); + ref_data.resize(num_output_elements); + + const int in_features = utils::align_up_4(C_in_per_group * K_h * K_w); + + // Perform activation, weight, and output quantized conv2d operation + for (int64_t n = 0; n < N; ++n) { + for (int64_t out_c = 0; out_c < C_out; ++out_c) { + for (int64_t out_h = 0; out_h < H_out; ++out_h) { + for (int64_t out_w = 0; out_w < W_out; ++out_w) { + int32_t int_sum = 0; + int32_t weight_sum = 0; // Track weight sum on the fly + + // Determine which group this output channel belongs to + int64_t group_idx = out_c / C_out_per_group; + int64_t in_c_start = group_idx * C_in_per_group; + int64_t in_c_end = (group_idx + 1) * C_in_per_group; + + // Convolution operation with integer accumulation + for (int64_t in_c = in_c_start; in_c < in_c_end; ++in_c) { + for (int64_t kh = 0; kh < K_h; ++kh) { + for (int64_t kw = 0; kw < K_w; ++kw) { + // Calculate input position with dilation + int64_t in_h = out_h * stride_h - pad_h + kh * dilation_h; + int64_t in_w = out_w * stride_w - pad_w + kw * dilation_w; + + // Check bounds (zero padding) + if (in_h >= 0 && in_h < H_in && in_w >= 0 && in_w < W_in) { + // Get input value and quantize to int8 + int64_t input_idx = n * (C_in * H_in * W_in) + + in_c * (H_in * W_in) + in_h * W_in + in_w; + + float quant_input_f = + std::round(input_data[input_idx] / input_scale) + + input_zero_point; + quant_input_f = + std::min(std::max(quant_input_f, -128.0f), 127.0f); + int8_t quantized_input = static_cast(quant_input_f); + + // Get quantized weight (already int8) + // Weight layout: [C_out, C_in_per_group * K_h * 
K_w] + int64_t weight_idx = out_c * in_features + + (kh * (K_w * C_in_per_group) + kw * C_in_per_group + + (in_c % C_in_per_group)); + int8_t quantized_weight = weight_data[weight_idx]; + + // Integer multiplication and accumulation + int_sum += static_cast(quantized_input) * + static_cast(quantized_weight); + + // Track weight sum for this output channel on the fly + weight_sum += static_cast(quantized_weight); + } else { + // For zero padding, we still need to account for the weight + // in weight_sum when input is effectively 0 (but quantized 0 + // is input_zero_point) + int64_t weight_idx = out_c * in_features + + (kh * (K_w * C_in_per_group) + kw * C_in_per_group + + (in_c % C_in_per_group)); + int8_t quantized_weight = weight_data[weight_idx]; + + // Add contribution from zero-padded input (quantized zero = + // input_zero_point) + int_sum += static_cast(input_zero_point) * + static_cast(quantized_weight); + + // Track weight sum for this output channel on the fly + weight_sum += static_cast(quantized_weight); + } + } + } + } + + // Convert accumulated integer result to float and apply scales + // Final result = (int_sum - zero_point_correction) * input_scale * + // weight_scale + bias zero_point_correction = input_zero_point * + // sum_of_weights_for_this_output_channel + int32_t zero_point_correction = input_zero_point * weight_sum; + int32_t accum_adjusted = int_sum - zero_point_correction; + float float_result = + accum_adjusted * input_scale * weight_scales_data[out_c]; + + // Add bias and store result + float_result += bias_data[out_c]; + + // Quantize the output to int8 + float quant_output_f = + std::round(float_result / output_scale) + output_zero_point; + quant_output_f = std::min(std::max(quant_output_f, -128.0f), 127.0f); + int8_t quantized_output = static_cast(quant_output_f); + + // Dequantize back to float + float dequant_output = + (static_cast(quantized_output) - output_zero_point) * + output_scale; + + int64_t output_idx = n * (C_out * 
H_out * W_out) + + out_c * (H_out * W_out) + out_h * W_out + out_w; + ref_data[output_idx] = dequant_output; + } + } + } + } +} + +void reference_impl(TestCase& test_case) { + conv2d_q8ta_q8csw_q8to_reference_impl(test_case); +} + +// Custom FLOP calculator for quantized conv2d operation +int64_t quantized_conv2d_flop_calculator(const TestCase& test_case) { + int kernel_idx = 9; // kernel_size is at index 9 for q8ta_q8csw_q8to + + // Get input and weight dimensions + const auto& input_sizes = test_case.inputs()[0].get_tensor_sizes(); + const auto& output_sizes = test_case.outputs()[0].get_tensor_sizes(); + + const auto& kernel_sizes = test_case.inputs()[kernel_idx].get_int32_data(); + + int64_t N = input_sizes[0]; + int64_t C_in = input_sizes[1]; + int64_t C_out = output_sizes[1]; + int64_t K_h = kernel_sizes[0]; + int64_t K_w = kernel_sizes[1]; + int64_t H_out = output_sizes[2]; + int64_t W_out = output_sizes[3]; + + // Calculate FLOPs for quantized conv2d operation + // Each output element requires: + // - C_in * K_h * K_w multiply-accumulate operations + // - Additional operations for quantization/dequantization + int64_t output_elements = N * C_out * H_out * W_out; + int64_t ops_per_output = C_in * K_h * K_w; + + int64_t flop = output_elements * (ops_per_output); + + return flop; +} + +int main(int argc, char* argv[]) { + set_debugging(false); + set_print_output(false); + set_print_latencies(false); + set_use_gpu_timestamps(true); + + print_performance_header(); + std::cout + << "Quantized Conv2d Operation with Output Quantization Prototyping Framework" + << std::endl; + print_separator(); + + ReferenceComputeFunc ref_fn = reference_impl; + + // Execute test cases using the new framework with custom FLOP calculator + auto results = execute_test_cases( + generate_quantized_conv2d_test_cases, + quantized_conv2d_flop_calculator, + "QuantizedConv2dQ8ToQ8To", + 0, + 10, + ref_fn); + + return 0; +} diff --git 
a/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d_dw.cpp b/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d_dw.cpp new file mode 100644 index 00000000000..c259b45de06 --- /dev/null +++ b/backends/vulkan/test/custom_ops/q8ta_q8csw_q8to_conv2d_dw.cpp @@ -0,0 +1,592 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include +#include +#include "conv2d_utils.h" +#include "utils.h" + +#include + +using namespace executorch::vulkan::prototyping; + +using namespace vkcompute; + +static constexpr int64_t kRefDimSizeLimit = 100; + +// Utility function to create a test case from a Conv2dConfig for depthwise +// convolution +TestCase create_test_case_from_config( + const Conv2dConfig& config, + utils::StorageType storage_type, + vkapi::ScalarType input_dtype) { + TestCase test_case; + + // Create a descriptive name for the test case + std::string storage_str = + (storage_type == utils::kTexture3D) ? "Texture3D" : "Buffer"; + std::string dtype_str = (input_dtype == vkapi::kFloat) ? "Float" : "Half"; + + std::string test_name = + config.test_case_name + "_" + storage_str + "_" + dtype_str; + test_case.set_name(test_name); + + // Set the operator name for the test case + std::string operator_name = "etvk." 
+ config.op_name + ".test"; + test_case.set_operator_name(operator_name); + + // Calculate output dimensions + int64_t H_out = config.get_output_height(); + int64_t W_out = config.get_output_width(); + + // Input tensor (float/half) - [1, C_in, H_in, W_in] (batch size always 1) + std::vector input_size = { + 1, config.channels.in, config.input_size.h, config.input_size.w}; + + ValueSpec input_tensor( + input_size, + input_dtype, + storage_type, + utils::kChannelsPacked, + DataGenType::RANDOM); + + if (debugging()) { + print_valuespec_data(input_tensor, "input_tensor", false, 64); + } + + float input_scale_val = 0.008123; + ValueSpec input_scale(input_scale_val); + + int32_t input_zero_point_val = 2; + ValueSpec input_zero_point(input_zero_point_val); + + // Quantized weight tensor (int8) for depthwise convolution + // Memory layout: [K_h, K_w, OC] + // For depthwise conv: groups = channels.out, in_channels_per_group = 1 + std::vector weight_size = { + config.kernel.h, config.kernel.w, config.channels.out}; + ValueSpec quantized_weight( + weight_size, + vkapi::kChar, // int8 for quantized weights + storage_type, + utils::kWidthPacked, + DataGenType::RANDINT8); + quantized_weight.set_constant(true); + + if (debugging()) { + print_valuespec_data(quantized_weight, "weight_tensor", false, 64); + } + + // Weight quantization scales (float/half, per-channel) + ValueSpec weight_scales( + {config.channels.out}, // Per output channel + input_dtype, + storage_type, + utils::kWidthPacked, + DataGenType::RANDOM_SCALES); + weight_scales.set_constant(true); + + ValueSpec weight_sums( + {config.channels.out}, // Per output channel + vkapi::kInt, + storage_type, + utils::kWidthPacked, + DataGenType::ZEROS); + weight_sums.set_constant(true); + + // Compute weight_sums data based on quantized weights for depthwise layout + // For depthwise conv: each output channel has K_h * K_w weights + // Custom computation for depthwise layout [K_h, K_w, OC] + auto& weight_sums_data = 
weight_sums.get_int32_data(); + auto& quantized_weight_data = quantized_weight.get_int8_data(); + + weight_sums_data.resize(config.channels.out); + + for (int64_t out_c = 0; out_c < config.channels.out; ++out_c) { + int32_t sum = 0; + for (int64_t kh = 0; kh < config.kernel.h; ++kh) { + for (int64_t kw = 0; kw < config.kernel.w; ++kw) { + // Weight indexing for depthwise layout [K_h, K_w, OC] + int64_t weight_idx = kh * (config.kernel.w * config.channels.out) + + kw * config.channels.out + out_c; + sum += static_cast(quantized_weight_data[weight_idx]); + } + } + weight_sums_data[out_c] = sum; + } + + // Bias (optional, float/half) - [C_out] + ValueSpec bias( + {config.channels.out}, // Per output channel + input_dtype, + storage_type, + utils::kWidthPacked, + DataGenType::RANDOM); + bias.set_constant(true); + + // Output quantization parameters + float output_scale_val = 0.05314; + ValueSpec output_scale(output_scale_val); + + int32_t output_zero_point_val = -1; + ValueSpec output_zero_point(output_zero_point_val); + + // Stride and padding parameters + ValueSpec stride({config.stride.h, config.stride.w}); + ValueSpec padding({config.padding.h, config.padding.w}); + + // Dilation and groups parameters + ValueSpec dilation({config.dilation.h, config.dilation.w}); + ValueSpec groups(config.groups); + + // Kernel size parameters + ValueSpec kernel_size({config.kernel.h, config.kernel.w}); + + // Output tensor (float/half) - [1, C_out, H_out, W_out] (batch size always 1) + ValueSpec output( + {1, config.channels.out, H_out, W_out}, + input_dtype, + storage_type, + utils::kChannelsPacked, + DataGenType::ZEROS); + + // Add all specs to test case for q8ta_q8csw_q8to operation + test_case.add_input_spec(input_tensor); + test_case.add_input_spec(input_scale); + test_case.add_input_spec(input_zero_point); + test_case.add_input_spec(quantized_weight); + test_case.add_input_spec(weight_sums); + test_case.add_input_spec(weight_scales); + test_case.add_input_spec(output_scale); 
+ test_case.add_input_spec(output_zero_point); + test_case.add_input_spec(bias); + test_case.add_input_spec(kernel_size); + test_case.add_input_spec(stride); + test_case.add_input_spec(padding); + test_case.add_input_spec(dilation); + test_case.add_input_spec(groups); + + test_case.add_output_spec(output); + + test_case.set_abs_tolerance(output_scale_val + 1e-4f); + + return test_case; +} + +// Generate easy test cases for quantized depthwise conv2d operation (for +// debugging) +std::vector generate_quantized_conv2d_dw_easy_cases() { + std::vector test_cases; + + // Single simple configuration for debugging - depthwise convolution + Conv2dConfig config = { + OutInChannels(8, 8), // channels (out, in) - equal for depthwise + InputSize2D(8, 8), // input_size (h, w) + KernelSize(3, 3), // kernel + Stride(2, 2), // stride + Padding(1, 1), // padding + Dilation(1, 1), // dilation + 8, // groups = channels.out for depthwise + }; + config.op_name = "conv2d_q8ta_q8csw_q8to"; + + // Test with both storage types and data types for completeness + std::vector storage_types = {utils::kTexture3D}; + std::vector float_types = {vkapi::kFloat}; + + // Generate test cases for each combination + for (const auto& storage_type : storage_types) { + for (const auto& input_dtype : float_types) { + test_cases.push_back( + create_test_case_from_config(config, storage_type, input_dtype)); + } + } + + return test_cases; +} + +// Generate test cases for quantized depthwise conv2d operation +std::vector generate_quantized_conv2d_dw_test_cases() { + std::vector test_cases; + + std::vector configs = { + // Depthwise convolutions: groups = channels.out, channels.in = + // channels.out + {OutInChannels(32, 32), + InputSize2D(64, 64), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + 32}, + {OutInChannels(64, 64), + InputSize2D(32, 32), + KernelSize(3, 3), + Stride(2, 2), + Padding(2, 2), + Dilation(1, 1), + 64}, + {OutInChannels(64, 64), + InputSize2D(32, 32), + 
KernelSize(3, 3), + Stride(2, 2), + Padding(1, 1), + Dilation(1, 1), + 64}, + {OutInChannels(80, 80), + InputSize2D(16, 16), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + 80}, + {OutInChannels(16, 16), + InputSize2D(57, 33), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + 16}, + // Different kernel sizes for depthwise + {OutInChannels(32, 32), + InputSize2D(64, 64), + KernelSize(5, 5), + Stride(1, 1), + Padding(2, 2), + Dilation(1, 1), + 32}, + {OutInChannels(96, 96), + InputSize2D(64, 64), + KernelSize(5, 5), + Stride(2, 2), + Padding(2, 2), + Dilation(1, 1), + 96}, + // Performance cases + {OutInChannels(128, 128), + InputSize2D(128, 128), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + 128}, + {OutInChannels(64, 64), + InputSize2D(256, 256), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + 64}, + {OutInChannels(288, 288), + InputSize2D(16, 16), + KernelSize(3, 3), + Stride(1, 1), + Padding(1, 1), + Dilation(1, 1), + 288}, + {OutInChannels(32, 32), + InputSize2D(128, 128), + KernelSize(3, 3), + Stride(1, 1), + Padding(2, 2), + Dilation(1, 1), + 32}}; + + // Test with different storage types and data types + std::vector storage_types = {utils::kTexture3D}; + + // Generate test cases for each combination + for (auto& config : configs) { + for (const auto& storage_type : storage_types) { + // Generate test case name programmatically + bool is_performance = config.channels.out > kRefDimSizeLimit || + config.channels.in > kRefDimSizeLimit || + config.input_size.h > kRefDimSizeLimit || + config.input_size.w > kRefDimSizeLimit; + std::string prefix = + is_performance ? 
"performance_dw_" : "correctness_dw_"; + std::string suffix = std::to_string(config.channels.out) + "/" + + std::to_string(config.channels.in) + "_" + + std::to_string(config.input_size.h) + "/" + + std::to_string(config.input_size.w) + "_" + + std::to_string(config.kernel.h) + "/" + + std::to_string(config.kernel.w); + + config.op_name = "conv2d_q8ta_q8csw_q8to"; + config.test_case_name = prefix + suffix; + + // Only test q8ta_q8csw_q8to if the int8 dot product extension is + // supported + if (vkcompute::api::context() + ->adapter_ptr() + ->supports_int8_dot_product()) { + test_cases.push_back( + create_test_case_from_config(config, storage_type, vkapi::kFloat)); + } + } + } + + return test_cases; +} + +// Reference implementation for activation, weight, and output quantized +// depthwise conv2d +void conv2d_q8ta_q8csw_q8to_dw_reference_impl(TestCase& test_case) { + // Extract input specifications + int32_t idx = 0; + const ValueSpec& input_spec = test_case.inputs()[idx++]; + const ValueSpec& input_scale_spec = test_case.inputs()[idx++]; + const ValueSpec& input_zeros_spec = test_case.inputs()[idx++]; + const ValueSpec& weight_spec = test_case.inputs()[idx++]; + const ValueSpec& weight_sums_spec = test_case.inputs()[idx++]; + (void)weight_sums_spec; + const ValueSpec& weight_scales_spec = test_case.inputs()[idx++]; + const ValueSpec& output_scale_spec = test_case.inputs()[idx++]; + const ValueSpec& output_zeros_spec = test_case.inputs()[idx++]; + const ValueSpec& bias_spec = test_case.inputs()[idx++]; + const ValueSpec& kernel_size_spec = test_case.inputs()[idx++]; + const ValueSpec& stride_spec = test_case.inputs()[idx++]; + const ValueSpec& padding_spec = test_case.inputs()[idx++]; + const ValueSpec& dilation_spec = test_case.inputs()[idx++]; + const ValueSpec& groups_spec = test_case.inputs()[idx++]; + + // Extract output specification (mutable reference) + ValueSpec& output_spec = test_case.outputs()[0]; + + // Get tensor dimensions + auto input_sizes = 
input_spec.get_tensor_sizes(); // [N, C_in, H_in, W_in] + auto weight_sizes = + weight_spec.get_tensor_sizes(); // [K_h, align_up_4(K_w), OC] + auto output_sizes = + output_spec.get_tensor_sizes(); // [N, C_out, H_out, W_out] + + int64_t N = input_sizes[0]; + int64_t C_in = input_sizes[1]; + int64_t H_in = input_sizes[2]; + int64_t W_in = input_sizes[3]; + int64_t C_out = output_sizes[1]; + int64_t H_out = output_sizes[2]; + int64_t W_out = output_sizes[3]; + + // Get kernel dimensions from kernel_size ValueSpec + auto kernel_size_data = kernel_size_spec.get_int32_data(); + int64_t K_h = kernel_size_data[0]; + int64_t K_w = kernel_size_data[1]; + + // Get stride, padding, dilation, and groups + auto stride_data = stride_spec.get_int32_data(); + auto padding_data = padding_spec.get_int32_data(); + auto dilation_data = dilation_spec.get_int32_data(); + int64_t stride_h = stride_data[0]; + int64_t stride_w = stride_data[1]; + int64_t pad_h = padding_data[0]; + int64_t pad_w = padding_data[1]; + int64_t dilation_h = dilation_data[0]; + int64_t dilation_w = dilation_data[1]; + int64_t groups = groups_spec.get_int_value(); + + // Skip for large tensors since computation time will be extremely slow + if (N > kRefDimSizeLimit || C_in > kRefDimSizeLimit || + H_in > kRefDimSizeLimit || W_in > kRefDimSizeLimit || + C_out > kRefDimSizeLimit) { + throw std::invalid_argument( + "One or more dimensions exceed the allowed limit for reference implementation."); + } + + if (input_spec.dtype != vkapi::kFloat) { + throw std::invalid_argument("Unsupported dtype"); + } + + // Verify this is a depthwise convolution + if (groups != C_out || C_in != C_out) { + throw std::invalid_argument( + "This is not a depthwise convolution configuration"); + } + + // Get raw data pointers + auto& input_data = input_spec.get_float_data(); + const float input_scale = input_scale_spec.get_float_value(); + const int32_t input_zero_point = input_zeros_spec.get_int_value(); + + auto& weight_data = 
weight_spec.get_int8_data(); + auto& weight_scales_data = weight_scales_spec.get_float_data(); + auto& bias_data = bias_spec.get_float_data(); + + const float output_scale = output_scale_spec.get_float_value(); + const int32_t output_zero_point = output_zeros_spec.get_int_value(); + + // Calculate number of output elements + int64_t num_output_elements = N * C_out * H_out * W_out; + + auto& ref_data = output_spec.get_ref_float_data(); + ref_data.resize(num_output_elements); + + // Perform activation, weight, and output quantized depthwise conv2d operation + for (int64_t n = 0; n < N; ++n) { + for (int64_t out_c = 0; out_c < C_out; ++out_c) { + for (int64_t out_h = 0; out_h < H_out; ++out_h) { + for (int64_t out_w = 0; out_w < W_out; ++out_w) { + int32_t int_sum = 0; + int32_t weight_sum = 0; // Track weight sum on the fly + + // For depthwise convolution, each output channel corresponds to one + // input channel + int64_t in_c = out_c; + + // Convolution operation with integer accumulation + for (int64_t kh = 0; kh < K_h; ++kh) { + for (int64_t kw = 0; kw < K_w; ++kw) { + // Calculate input position with dilation + int64_t in_h = out_h * stride_h - pad_h + kh * dilation_h; + int64_t in_w = out_w * stride_w - pad_w + kw * dilation_w; + + // Check bounds (zero padding) + if (in_h >= 0 && in_h < H_in && in_w >= 0 && in_w < W_in) { + // Get input value and quantize to int8 + int64_t input_idx = n * (C_in * H_in * W_in) + + in_c * (H_in * W_in) + in_h * W_in + in_w; + + float quant_input_f = + std::round(input_data[input_idx] / input_scale) + + input_zero_point; + quant_input_f = + std::min(std::max(quant_input_f, -128.0f), 127.0f); + int8_t quantized_input = static_cast(quant_input_f); + + // Get quantized weight using depthwise layout [K_h, K_w, OC] + int64_t weight_idx = kh * (K_w * C_out) + kw * C_out + out_c; + int8_t quantized_weight = weight_data[weight_idx]; + + if (false && in_w == 0 && in_h == 0 && out_c == 0) { + std::cout << "input: " << 
input_data[input_idx] << std::endl; + std::cout << "quantized_input: " << (int)quantized_input + << std::endl; + std::cout << "quantized_weight: " << (int)quantized_weight + << std::endl; + } + // Integer multiplication and accumulation + int_sum += static_cast(quantized_input) * + static_cast(quantized_weight); + + // Track weight sum for this output channel on the fly + weight_sum += static_cast(quantized_weight); + } else { + // For zero padding, we still need to account for the weight + // in weight_sum when input is effectively 0 (but quantized 0 + // is input_zero_point) + int64_t weight_idx = kh * (K_w * C_out) + kw * C_out + out_c; + int8_t quantized_weight = weight_data[weight_idx]; + + // Add contribution from zero-padded input (quantized zero = + // input_zero_point) + int_sum += static_cast(input_zero_point) * + static_cast(quantized_weight); + + // Track weight sum for this output channel on the fly + weight_sum += static_cast(quantized_weight); + } + } + } + + // Convert accumulated integer result to float and apply scales + // Final result = (int_sum - zero_point_correction) * input_scale * + // weight_scale + bias zero_point_correction = input_zero_point * + // sum_of_weights_for_this_output_channel + int32_t zero_point_correction = input_zero_point * weight_sum; + int32_t accum_adjusted = int_sum - zero_point_correction; + float float_result = + accum_adjusted * input_scale * weight_scales_data[out_c]; + + // Add bias and store result + float_result += bias_data[out_c]; + + // Quantize the output to int8 + float quant_output_f = + std::round(float_result / output_scale) + output_zero_point; + quant_output_f = std::min(std::max(quant_output_f, -128.0f), 127.0f); + int8_t quantized_output = static_cast(quant_output_f); + + if (false && out_c < 4 && out_h < 1 && out_w < 4) { + std::cout << "int_sum[" << out_c << ", " << out_h << ", " << out_w + << "] = " << int_sum << ", " << float_result << ", " + << output_scale << ", " << quant_output_f << 
std::endl; + } + + // Dequantize back to float + float dequant_output = + (static_cast(quantized_output) - output_zero_point) * + output_scale; + + int64_t output_idx = n * (C_out * H_out * W_out) + + out_c * (H_out * W_out) + out_h * W_out + out_w; + ref_data[output_idx] = dequant_output; + } + } + } + } +} + +void reference_impl(TestCase& test_case) { + conv2d_q8ta_q8csw_q8to_dw_reference_impl(test_case); +} + +// Custom FLOP calculator for quantized depthwise conv2d operation +int64_t quantized_conv2d_dw_flop_calculator(const TestCase& test_case) { + int kernel_idx = 9; // kernel_size is at index 9 for q8ta_q8csw_q8to + + // Get input and weight dimensions + const auto& input_sizes = test_case.inputs()[0].get_tensor_sizes(); + const auto& output_sizes = test_case.outputs()[0].get_tensor_sizes(); + + const auto& kernel_sizes = test_case.inputs()[kernel_idx].get_int32_data(); + + int64_t N = input_sizes[0]; + int64_t C_out = output_sizes[1]; + int64_t K_h = kernel_sizes[0]; + int64_t K_w = kernel_sizes[1]; + int64_t H_out = output_sizes[2]; + int64_t W_out = output_sizes[3]; + + // Calculate FLOPs for quantized depthwise conv2d operation + // Each output element requires: + // - K_h * K_w multiply-accumulate operations (only one input channel per + // output channel) + // - Additional operations for quantization/dequantization + int64_t output_elements = N * C_out * H_out * W_out; + int64_t ops_per_output = K_h * K_w; + + int64_t flop = output_elements * ops_per_output; + + return flop; +} + +int main(int argc, char* argv[]) { + set_debugging(false); + set_print_output(false); + set_print_latencies(false); + set_use_gpu_timestamps(true); + + print_performance_header(); + std::cout + << "Quantized Depthwise Conv2d Operation with Output Quantization Prototyping Framework" + << std::endl; + print_separator(); + + ReferenceComputeFunc ref_fn = reference_impl; + + // Execute test cases using the new framework with custom FLOP calculator + auto results = 
execute_test_cases( + generate_quantized_conv2d_dw_test_cases, + quantized_conv2d_dw_flop_calculator, + "QuantizedDepthwiseInt8Conv2d", + 0, + 1, + ref_fn); + + return 0; +} diff --git a/backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add.cpp b/backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add.cpp new file mode 100644 index 00000000000..5799bc194c9 --- /dev/null +++ b/backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add.cpp @@ -0,0 +1,265 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include +#include +#include "utils.h" + +using namespace executorch::vulkan::prototyping; + +// Utility function to create a test case for quantized add operation +TestCase create_quantized_add_test_case( + const std::vector& sizes, + utils::StorageType storage_type, + vkapi::ScalarType input_dtype) { + TestCase test_case; + + // Create a descriptive name for the test case + std::string size_str = ""; + for (size_t i = 0; i < sizes.size(); ++i) { + size_str += std::to_string(sizes[i]); + if (i < sizes.size() - 1) + size_str += "x"; + } + + std::string storage_str = + (storage_type == utils::kTexture3D) ? "Texture3D" : "Buffer"; + std::string dtype_str = (input_dtype == vkapi::kFloat) ? 
"Float" : "Half"; + + std::string test_name = + "QuantizedAdd_" + size_str + "_" + storage_str + "_" + dtype_str; + test_case.set_name(test_name); + + // Set the operator name for the test case + test_case.set_operator_name("et_vk.add_q8ta_q8ta_q8to.test"); + + // Input tensor A (float/half) + ValueSpec input_a( + sizes, + input_dtype, + storage_type, + utils::kChannelsPacked, + DataGenType::RANDOM); + + // Input tensor B (float/half) + ValueSpec input_b( + sizes, + input_dtype, + storage_type, + utils::kChannelsPacked, + DataGenType::RANDOM); + + // Quantization parameters for input A + float input_a_scale_val = 0.007843; // 2/255 approximately + ValueSpec input_a_scale(input_a_scale_val); + + int32_t input_a_zero_point_val = 3; + ValueSpec input_a_zero_point(input_a_zero_point_val); + + // Quantization parameters for input B + float input_b_scale_val = 0.009412; // 2.4/255 approximately + ValueSpec input_b_scale(input_b_scale_val); + + int32_t input_b_zero_point_val = -2; + ValueSpec input_b_zero_point(input_b_zero_point_val); + + // Output quantization parameters + float output_scale_val = 0.015686; // 4/255 approximately + ValueSpec output_scale(output_scale_val); + + int32_t output_zero_point_val = 1; + ValueSpec output_zero_point(output_zero_point_val); + + // Alpha parameter + float alpha_val = 1.0f; + ValueSpec alpha(alpha_val); + + // Output tensor (float/half) + ValueSpec output( + sizes, + input_dtype, + storage_type, + utils::kChannelsPacked, + DataGenType::ZEROS); + + // Add all specs to test case for q8ta_q8ta_q8to add operation + test_case.add_input_spec(input_a); + test_case.add_input_spec(input_b); + test_case.add_input_spec(input_a_scale); + test_case.add_input_spec(input_a_zero_point); + test_case.add_input_spec(input_b_scale); + test_case.add_input_spec(input_b_zero_point); + test_case.add_input_spec(output_scale); + test_case.add_input_spec(output_zero_point); + test_case.add_input_spec(alpha); + + test_case.add_output_spec(output); + + 
test_case.set_abs_tolerance(output_scale_val + 1e-4f); + + return test_case; +} + +// Generate test cases for quantized add operation +std::vector generate_quantized_add_test_cases() { + std::vector test_cases; + + // Define different input size configurations + std::vector> size_configs = { + {3, 32, 32}, // Small square + {8, 64, 64}, // Medium square + {16, 16, 16}, // 3D cube + {8, 32, 16}, // 3D rectangular + {7, 7, 13}, // Irregular sizes + }; + + // Storage types to test + std::vector storage_types = {utils::kTexture3D}; + + // Data types to test + std::vector data_types = {vkapi::kFloat}; + + // Generate test cases for each combination + for (const auto& sizes : size_configs) { + for (const auto& storage_type : storage_types) { + for (const auto& data_type : data_types) { + test_cases.push_back( + create_quantized_add_test_case(sizes, storage_type, data_type)); + } + } + } + + return test_cases; +} + +// Reference implementation for quantized add operation +void add_q8ta_q8ta_q8to_reference_impl(TestCase& test_case) { + // Extract input specifications + int32_t idx = 0; + const ValueSpec& input_a_spec = test_case.inputs()[idx++]; + const ValueSpec& input_b_spec = test_case.inputs()[idx++]; + const ValueSpec& input_a_scale_spec = test_case.inputs()[idx++]; + const ValueSpec& input_a_zero_point_spec = test_case.inputs()[idx++]; + const ValueSpec& input_b_scale_spec = test_case.inputs()[idx++]; + const ValueSpec& input_b_zero_point_spec = test_case.inputs()[idx++]; + const ValueSpec& output_scale_spec = test_case.inputs()[idx++]; + const ValueSpec& output_zero_point_spec = test_case.inputs()[idx++]; + const ValueSpec& alpha_spec = test_case.inputs()[idx++]; + + // Extract output specification (mutable reference) + ValueSpec& output_spec = test_case.outputs()[0]; + + // Get tensor dimensions + auto input_sizes = input_a_spec.get_tensor_sizes(); + int64_t num_elements = input_a_spec.numel(); + + if (input_a_spec.dtype != vkapi::kFloat) { + throw 
std::invalid_argument("Unsupported dtype"); + } + + // Get raw data pointers + auto& input_a_data = input_a_spec.get_float_data(); + auto& input_b_data = input_b_spec.get_float_data(); + + const float input_a_scale = input_a_scale_spec.get_float_value(); + const int32_t input_a_zero_point = input_a_zero_point_spec.get_int_value(); + const float input_b_scale = input_b_scale_spec.get_float_value(); + const int32_t input_b_zero_point = input_b_zero_point_spec.get_int_value(); + const float output_scale = output_scale_spec.get_float_value(); + const int32_t output_zero_point = output_zero_point_spec.get_int_value(); + const float alpha = alpha_spec.get_float_value(); + + auto& ref_data = output_spec.get_ref_float_data(); + ref_data.resize(num_elements); + + // Perform quantized add operation + for (int64_t i = 0; i < num_elements; ++i) { + // Quantize input A to int8 + float quant_a_f = + std::round(input_a_data[i] / input_a_scale) + input_a_zero_point; + quant_a_f = std::min(std::max(quant_a_f, -128.0f), 127.0f); + int8_t quantized_a = static_cast(quant_a_f); + + // Quantize input B to int8 + float quant_b_f = + std::round(input_b_data[i] / input_b_scale) + input_b_zero_point; + quant_b_f = std::min(std::max(quant_b_f, -128.0f), 127.0f); + int8_t quantized_b = static_cast(quant_b_f); + + // Dequantize both inputs to a common scale for addition + float dequant_a = + (static_cast(quantized_a) - input_a_zero_point) * input_a_scale; + float dequant_b = + (static_cast(quantized_b) - input_b_zero_point) * input_b_scale; + + // Perform addition in float space with alpha + float float_result = dequant_a + alpha * dequant_b; + + // Quantize the result to int8 + float quant_output_f = + std::round(float_result / output_scale) + output_zero_point; + quant_output_f = std::min(std::max(quant_output_f, -128.0f), 127.0f); + int8_t quantized_output = static_cast(quant_output_f); + + // Dequantize back to float for comparison + float dequant_output = + (static_cast(quantized_output) 
- output_zero_point) * + output_scale; + + ref_data[i] = dequant_output; + } +} + +void reference_impl(TestCase& test_case) { + add_q8ta_q8ta_q8to_reference_impl(test_case); +} + +// Custom FLOP calculator for quantized add operation +int64_t quantized_add_flop_calculator(const TestCase& test_case) { + // Calculate total elements from the first input tensor + int64_t total_elements = 1; + if (!test_case.empty() && test_case.num_inputs() > 0 && + test_case.inputs()[0].is_tensor()) { + const auto& sizes = test_case.inputs()[0].get_tensor_sizes(); + for (int64_t size : sizes) { + total_elements *= size; + } + } + + // Quantized add operation includes: + // - 2 quantizations (float to int8) + // - 2 dequantizations (int8 to float) + // - 1 addition + // For simplicity, we count this as 1 FLOP per element (the addition) + return total_elements; +} + +int main(int argc, char* argv[]) { + set_debugging(false); + set_print_output(false); + set_print_latencies(false); + set_use_gpu_timestamps(true); + + print_performance_header(); + std::cout << "Quantized Add Operation (q8ta_q8ta_q8to) Prototyping Framework" + << std::endl; + print_separator(); + + ReferenceComputeFunc ref_fn = reference_impl; + + // Execute test cases using the new framework with custom FLOP calculator + auto results = execute_test_cases( + generate_quantized_add_test_cases, + quantized_add_flop_calculator, + "QuantizedAddQ8taQ8taQ8to", + 0, + 1, + ref_fn); + + return 0; +} diff --git a/backends/vulkan/test/custom_ops/qdq8ta_conv2d_activations.cpp b/backends/vulkan/test/custom_ops/qdq8ta_conv2d_activations.cpp new file mode 100644 index 00000000000..5275e6c9335 --- /dev/null +++ b/backends/vulkan/test/custom_ops/qdq8ta_conv2d_activations.cpp @@ -0,0 +1,251 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include +#include +#include +#include +#include +#include +#include "utils.h" + +#include + +using namespace executorch::vulkan::prototyping; +using namespace vkcompute; + +static constexpr int64_t kRefDimSizeLimit = 512; + +// QDQ8TA Conv2D configuration struct for 4D tensor quantize-dequantize testing +struct QDQ8TAConv2DConfig { + int64_t batch_size; // N dimension + int64_t in_channels; // C dimension + int64_t height; // H dimension + int64_t width; // W dimension + std::string test_case_name = "placeholder"; + std::string op_name = "qdq8ta_conv2d_input"; +}; + +// Utility function to create a test case from a QDQ8TAConv2DConfig +TestCase create_test_case_from_config( + const QDQ8TAConv2DConfig& config, + utils::StorageType storage_type, + vkapi::ScalarType input_dtype) { + TestCase test_case; + + // Create a descriptive name for the test case + std::string storage_str = + (storage_type == utils::kTexture3D) ? "Texture3D" : "Buffer"; + std::string dtype_str = (input_dtype == vkapi::kFloat) ? "Float" : "Half"; + + std::string test_name = + config.test_case_name + "_" + storage_str + "_" + dtype_str; + test_case.set_name(test_name); + + // Set the operator name for the test case + std::string operator_name = "etvk." 
+ config.op_name + ".default"; + test_case.set_operator_name(operator_name); + + // Input tensor (float) - [N, C, H, W] + std::vector input_size = { + config.batch_size, config.in_channels, config.height, config.width}; + ValueSpec input_tensor( + input_size, + input_dtype, + storage_type, + utils::kChannelsPacked, // Use channels packed for conv2d tensors + DataGenType::RANDOM); + + float scale_val = 0.007112; + ValueSpec scale(scale_val); + + // Generate random zero point within quantization range + int32_t zero_point_val = -2; + ValueSpec zero_point(zero_point_val); + + // Output tensor (float) - same shape as input [N, C, H, W] + ValueSpec output_tensor( + input_size, + input_dtype, + storage_type, + utils::kChannelsPacked, + DataGenType::ZEROS); + + // Add all specs to test case + test_case.add_input_spec(input_tensor); + test_case.add_input_spec(scale); + test_case.add_input_spec(zero_point); + test_case.add_output_spec(output_tensor); + + test_case.set_abs_tolerance(scale_val + 1e-4); + + return test_case; +} + +// Generate easy test cases for qdq8ta_conv2d operation (for debugging) +std::vector generate_qdq8ta_conv2d_easy_cases() { + std::vector test_cases; + + // Single simple configuration for debugging + QDQ8TAConv2DConfig config = { + 1, // batch_size + 3, // in_channels + 4, // height + 4, // width + "simple", // test_case_name + }; + + // Test with both storage types + std::vector storage_types = {utils::kTexture3D}; + std::vector float_types = {vkapi::kFloat}; + + // Generate test cases for each combination + for (const auto& storage_type : storage_types) { + for (const auto& input_dtype : float_types) { + test_cases.push_back( + create_test_case_from_config(config, storage_type, input_dtype)); + } + } + + return test_cases; +} + +// Generate test cases for qdq8ta_conv2d operation +std::vector generate_qdq8ta_conv2d_test_cases() { + std::vector test_cases; + + std::vector configs = { + // Small test cases for correctness + {1, 3, 16, 16}, + {1, 8, 
32, 32}, + {1, 16, 24, 24}, + {1, 32, 12, 12}, + {1, 1, 64, 64}, + {1, 3, 64, 64}, + {1, 4, 16, 16}, + + // Different tensor sizes + {1, 8, 20, 20}, + {1, 16, 14, 14}, + {1, 8, 28, 28}, + + // Odd tensor sizes + {1, 3, 15, 15}, + {1, 13, 31, 31}, + {1, 17, 23, 23}, + + // Performance test cases (larger tensors) + {1, 64, 128, 128}, + {1, 32, 64, 64}, + {1, 128, 56, 56}, + }; + + // Test with different storage types + std::vector storage_types = {utils::kTexture3D}; + + for (auto config : configs) { + std::string prefix = + (config.batch_size < kRefDimSizeLimit && + config.in_channels < kRefDimSizeLimit && + config.height < kRefDimSizeLimit && config.width < kRefDimSizeLimit) + ? "correctness_" + : "performance_"; + std::string generated_test_case_name = prefix + + std::to_string(config.batch_size) + "_" + + std::to_string(config.in_channels) + "_" + + std::to_string(config.height) + "_" + std::to_string(config.width); + + config.test_case_name = generated_test_case_name; + + for (const auto& storage_type : storage_types) { + test_cases.push_back( + create_test_case_from_config(config, storage_type, vkapi::kFloat)); + } + } + + return test_cases; +} + +// Reference implementation for qdq8ta_conv2d operation +void qdq8ta_conv2d_reference_impl(TestCase& test_case) { + int32_t idx = 0; + const ValueSpec& input_spec = test_case.inputs()[idx++]; + const ValueSpec& scale_spec = test_case.inputs()[idx++]; + const ValueSpec& zero_point_spec = test_case.inputs()[idx++]; + + // Extract output specification + ValueSpec& output_spec = test_case.outputs()[0]; + + // Get tensor dimensions + auto input_sizes = input_spec.get_tensor_sizes(); // [N, C, H, W] + int64_t N = input_sizes[0]; + int64_t C = input_sizes[1]; + int64_t H = input_sizes[2]; + int64_t W = input_sizes[3]; + + // Skip for large tensors since computation time will be extremely slow + if (N > kRefDimSizeLimit || C > kRefDimSizeLimit || H > kRefDimSizeLimit || + W > kRefDimSizeLimit) { + throw std::invalid_argument( 
+ "One or more dimensions (N, C, H, W) exceed the allowed limit for reference implementation."); + } + + if (input_spec.dtype != vkapi::kFloat) { + throw std::invalid_argument("Unsupported dtype"); + } + + // Get raw data pointers + auto& input_data = input_spec.get_float_data(); + + // Extract the randomized scale and zero point values (following + // q8csw_conv2d.cpp pattern) + float scale = scale_spec.get_float_value(); + int32_t zero_point = zero_point_spec.get_int_value(); + int32_t quant_min = -128; + int32_t quant_max = 127; + + // Prepare output data + auto& ref_data = output_spec.get_ref_float_data(); + int64_t num_elements = N * C * H * W; + ref_data.resize(num_elements); + + // Perform quantize-dequantize operation on each element + for (int64_t i = 0; i < num_elements; ++i) { + float input_val = input_data[i]; + + // Quantize: quantized = round(input / scale + zero_point) + float quantized_float = std::round(input_val / scale) + zero_point; + + // Clamp to quantization range + quantized_float = std::max(quantized_float, static_cast(quant_min)); + quantized_float = std::min(quantized_float, static_cast(quant_max)); + + int32_t quantized_int = static_cast(quantized_float); + + // Dequantize: output = (quantized - zero_point) * scale + float dequantized = (quantized_int - zero_point) * scale; + + ref_data[i] = dequantized; + } +} + +int main(int argc, char* argv[]) { + set_debugging(false); + set_print_output(false); + set_print_latencies(false); + set_use_gpu_timestamps(true); + + print_performance_header(); + std::cout << "QDQ8TA Conv2D Operation Prototyping Framework" << std::endl; + print_separator(); + + ReferenceComputeFunc ref_fn = qdq8ta_conv2d_reference_impl; + + auto results = execute_test_cases( + generate_qdq8ta_conv2d_test_cases, "QDQ8TAConv2D", 0, 1, ref_fn); + + return 0; +} diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl index 3162857c2d3..4ef1cdd7fed 100644 --- 
a/backends/vulkan/test/custom_ops/targets.bzl +++ b/backends/vulkan/test/custom_ops/targets.bzl @@ -60,9 +60,11 @@ def define_common_targets(is_fbcode = False): ], headers = [ "utils.h", + "conv2d_utils.h", ], exported_headers = [ "utils.h", + "conv2d_utils.h", ], platforms = get_platforms(), deps = [ @@ -97,3 +99,7 @@ def define_common_targets(is_fbcode = False): define_custom_op_test_binary("q8csw_conv2d") define_custom_op_test_binary("choose_qparams_per_row") define_custom_op_test_binary("q4gsw_linear") + define_custom_op_test_binary("qdq8ta_conv2d_activations") + define_custom_op_test_binary("q8ta_q8csw_q8to_conv2d") + define_custom_op_test_binary("q8ta_q8csw_q8to_conv2d_dw") + define_custom_op_test_binary("q8ta_q8ta_q8to_add") diff --git a/backends/vulkan/test/custom_ops/utils.cpp b/backends/vulkan/test/custom_ops/utils.cpp index 2aa827a4d5a..4de6c32ac25 100644 --- a/backends/vulkan/test/custom_ops/utils.cpp +++ b/backends/vulkan/test/custom_ops/utils.cpp @@ -661,7 +661,12 @@ float collect_gpu_timing_us(ComputeGraph& graph) { float total_duration_us = 0.0f; for (const auto& shader_result : results) { if (shader_result.kernel_name.find("nchw_to") == std::string::npos && - shader_result.kernel_name.find("to_nchw") == std::string::npos) { + shader_result.kernel_name.find("to_nchw") == std::string::npos && + shader_result.kernel_name.find( + "quantize_and_pack_q8ta_conv2d_input") == std::string::npos && + shader_result.kernel_name.find( + "unpack_and_dequantize_q8ta_conv2d_output") == + std::string::npos) { // Calculate duration from start and end times, convert from ns to μs uint64_t duration_ns = shader_result.end_time_ns - shader_result.start_time_ns; @@ -1715,6 +1720,41 @@ void compute_weight_sums( } } +// Compute weight sums for 4D quantized conv2d operations +// Weight layout: [C_out, K_h, K_w, align_up_4(C_in_per_group)] +void compute_weight_sums_4d( + ValueSpec& weight_sums, + const ValueSpec& quantized_weight, + int64_t out_channels, + int64_t kernel_h, + 
int64_t kernel_w, + int64_t aligned_in_channels) { + auto& weight_sums_data = weight_sums.get_int32_data(); + auto& quantized_weight_data = quantized_weight.get_int8_data(); + + weight_sums_data.resize(out_channels); + + // For each output channel, compute the sum of quantized weights + for (int64_t out_c = 0; out_c < out_channels; ++out_c) { + int32_t sum = 0; + + for (int64_t kh = 0; kh < kernel_h; ++kh) { + for (int64_t kw = 0; kw < kernel_w; ++kw) { + for (int64_t in_c = 0; in_c < aligned_in_channels; ++in_c) { + // Weight indexing: [out_c, kh, kw, in_c] + int64_t weight_idx = + out_c * (kernel_h * kernel_w * aligned_in_channels) + + kh * (kernel_w * aligned_in_channels) + kw * aligned_in_channels + + in_c; + sum += static_cast(quantized_weight_data[weight_idx]); + } + } + } + + weight_sums_data[out_c] = sum; + } +} + // Helper function to unpack 4-bit values from uint8 (same as in // q4gsw_linear.cpp) std::pair unpack_4bit_utils(uint8_t packed) { diff --git a/backends/vulkan/test/custom_ops/utils.h b/backends/vulkan/test/custom_ops/utils.h index f1736f1d144..b80f28639e8 100644 --- a/backends/vulkan/test/custom_ops/utils.h +++ b/backends/vulkan/test/custom_ops/utils.h @@ -653,6 +653,16 @@ void compute_weight_sums( int64_t out_features, int64_t elements_per_output_feature); +// Compute weight sums for 4D quantized conv2d operations +// Weight layout: [C_out, K_h, K_w, align_up_4(C_in_per_group)] +void compute_weight_sums_4d( + ValueSpec& weight_sums, + const ValueSpec& quantized_weight, + int64_t out_channels, + int64_t kernel_h, + int64_t kernel_w, + int64_t aligned_in_channels); + // Compute weight sums for 4-bit group symmetric quantized weights void compute_weight_sums_4bit_grouped( ValueSpec& weight_sums, diff --git a/backends/vulkan/test/scripts/test_model.sh b/backends/vulkan/test/scripts/test_model.sh index 5f06d2c039b..40ec88bae70 100755 --- a/backends/vulkan/test/scripts/test_model.sh +++ b/backends/vulkan/test/scripts/test_model.sh @@ -111,6 +111,7 @@ 
build_core_libraries_and_devtools() { -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ diff --git a/backends/vulkan/test/scripts/test_op.sh b/backends/vulkan/test/scripts/test_op.sh index 1ec07b7f75f..797089e54dc 100755 --- a/backends/vulkan/test/scripts/test_op.sh +++ b/backends/vulkan/test/scripts/test_op.sh @@ -138,6 +138,7 @@ build_core_libraries() { -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py index f8194f0b32c..f92cea64767 100644 --- a/backends/vulkan/test/test_vulkan_delegate.py +++ b/backends/vulkan/test/test_vulkan_delegate.py @@ -2482,6 +2482,7 @@ def forward(self, x): rtol=1e-1, ) + @unittest.skip("Cannot run on swiftshader due to no integer dot product support") def test_vulkan_backend_xnnpack_pt2e_quantized_conv_sequence(self): """ Test a sequence of convolution layers quantized with PT2E quantization. @@ -2572,6 +2573,7 @@ def forward(self, x): rtol=1e-1, ) + @unittest.skip("Cannot run on swiftshader due to no integer dot product support") def test_vulkan_backend_xnnpack_pt2e_quantized_conv_sequence_all_reduced(self): """ Test a sequence of convolution layers quantized with PT2E quantization. 
diff --git a/backends/vulkan/test/test_vulkan_passes.py b/backends/vulkan/test/test_vulkan_passes.py index 4a30ab6c2de..438126a179f 100644 --- a/backends/vulkan/test/test_vulkan_passes.py +++ b/backends/vulkan/test/test_vulkan_passes.py @@ -3,15 +3,8 @@ import torch -from executorch.backends.transforms.addmm_mm_to_linear import AddmmToLinearTransform -from executorch.backends.vulkan._passes import FuseQuantizedOpsTransform from executorch.backends.vulkan._passes.fuse_patterns import FusePatternsPass -from executorch.backends.vulkan.quantizer.vulkan_quantizer import ( - get_symmetric_quantization_config, - VulkanQuantizer, -) - from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge from executorch.exir.backend.canonical_partitioners.config_partitioner import ( @@ -94,66 +87,6 @@ def op_node_count(graph_module: torch.fx.GraphModule, canonical_op_name: str) -> class TestVulkanPasses(unittest.TestCase): - def test_fuse_int8pack_mm(self): - K = 256 - N = 256 - model = SingleLinearModule(K, N) - sample_inputs = model.get_sample_inputs() - - quantizer = VulkanQuantizer() - quantizer.set_global( - get_symmetric_quantization_config(is_dynamic=False, weight_bits=8) - ) - - edge_manager = quantize_and_lower_module( - model, - sample_inputs, - quantizer, - ) - - ep = edge_manager._edge_programs["forward"] - edge_manager.transform( - [ - AddmmToLinearTransform(), - FuseQuantizedOpsTransform(ep), - ] - ) - - gm = ep.graph_module - - self.assertEqual(op_node_count(gm, "_weight_int8pack_mm.default"), 1) - self.assertEqual(op_node_count(gm, "dequantize_per_channel.default"), 0) - - def test_fuse_linear_qcs4w(self): - K = 256 - N = 256 - model = SingleLinearModule(K, N) - sample_inputs = model.get_sample_inputs() - - quantizer = VulkanQuantizer() - quantizer.set_global( - get_symmetric_quantization_config(is_dynamic=False, weight_bits=4) - ) - - edge_manager = quantize_and_lower_module( - model, - sample_inputs, - quantizer, - ) - - ep = 
edge_manager._edge_programs["forward"] - edge_manager.transform( - [ - AddmmToLinearTransform(), - FuseQuantizedOpsTransform(ep), - ] - ) - - gm = ep.graph_module - - self.assertEqual(op_node_count(gm, "linear_qcs4w.default"), 1) - self.assertEqual(op_node_count(gm, "dequantize_per_channel.default"), 0) - def test_fuse_rotary_emb(self): """Test conversion of rotary embedding pattern to et_vk.apply_rotary_emb custom op.""" @@ -238,7 +171,8 @@ def _reshape_for_broadcast(self, freqs_cis: torch.Tensor, x: torch.Tensor): # Apply the rotary embedding pass ep = edge_manager._edge_programs["forward"] - rotary_pass = FusePatternsPass(ep) + rotary_pass = FusePatternsPass() + rotary_pass._exported_program = ep result = rotary_pass.call(ep.graph_module) # Verify that the pass was successful diff --git a/backends/vulkan/test/utils.py b/backends/vulkan/test/utils.py index 41c1d92bd00..a887c53473a 100644 --- a/backends/vulkan/test/utils.py +++ b/backends/vulkan/test/utils.py @@ -90,7 +90,9 @@ def export_model_to_vulkan( qmode=QuantizationMode.NONE, ): compile_options = {} - exported_graph = get_exported_graph(model, sample_inputs, qmode=qmode) + exported_graph = get_exported_graph( + model, sample_inputs, dynamic_shapes=dynamic_shapes, qmode=qmode + ) program = export( exported_graph, sample_inputs, @@ -303,13 +305,13 @@ def run_and_check_output( Returns: bool: True if outputs match within tolerance, False otherwise """ - # Load the ExecutorTorch program + # Load the ExecuTorch program executorch_module = _load_for_executorch_from_buffer(executorch_program.buffer) # Flatten inputs for execution inputs_flattened, _ = tree_flatten(sample_inputs) - # Run the ExecutorTorch program + # Run the ExecuTorch program model_output = executorch_module.run_method("forward", tuple(inputs_flattened)) # Generate reference outputs using the reference model diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 
a193d02da88..189562178a7 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -187,6 +187,8 @@ std::vector get_reference_strides( default: return {}; } + default: + VK_THROW("Unsupported memory layout: ", layout); } return {}; } diff --git a/backends/vulkan/utils.py b/backends/vulkan/utils.py index 96f200eecbc..09c57f649ae 100644 --- a/backends/vulkan/utils.py +++ b/backends/vulkan/utils.py @@ -128,7 +128,7 @@ def is_param_node(program: ExportedProgram, node: torch.fx.Node) -> bool: is_get_attr_node(node) or is_param(program, node) or is_buffer(program, node) - or is_constant(program, node) + or is_lifted_tensor_constant(program, node) ) @@ -206,6 +206,8 @@ def is_tensor_arg_node(node: Any) -> bool: if isinstance(node, torch.fx.Node): return is_tensor_node(node) elif isinstance(node, (list, tuple)): + if len(node) == 0: + return False return all(is_tensor_node(n) for n in node) return False @@ -348,6 +350,8 @@ def find_quant_user(node: torch.fx.Node) -> Optional[torch.fx.Node]: VkMemoryLayout.TENSOR_WIDTH_PACKED, VkMemoryLayout.TENSOR_HEIGHT_PACKED, VkMemoryLayout.TENSOR_CHANNELS_PACKED, + VkMemoryLayout.PACKED_INT8_4W4C, + VkMemoryLayout.PACKED_INT8_4H4W, } MemoryLayoutSet = Set[VkMemoryLayout] @@ -400,6 +404,12 @@ def required_image_extents(sizes: torch.Size, layout: VkMemoryLayout) -> ImageEx height = (height + 3) // 4 elif layout == VkMemoryLayout.TENSOR_CHANNELS_PACKED: channels = (channels + 3) // 4 + elif layout == VkMemoryLayout.PACKED_INT8_4W4C: + width = (width + 3) // 4 + channels = (channels + 3) // 4 + elif layout == VkMemoryLayout.PACKED_INT8_4H4W: + height = (height + 3) // 4 + width = (width + 3) // 4 else: raise RuntimeError(f"Unsupported memory layout {layout}") @@ -692,6 +702,8 @@ def make_filtered_tensor_repset( ## Convenience TensorRepSet definitions +PACKED_INT8_4W4C_BUFFER = TensorRepSet({VkMemoryLayout.PACKED_INT8_4W4C}, set()) + CONTIGUOUS_ANY = TensorRepSet( 
{VkMemoryLayout.TENSOR_WIDTH_PACKED}, {VkMemoryLayout.TENSOR_WIDTH_PACKED} ) @@ -1218,6 +1230,16 @@ def is_in_8bit_range(tensor: torch.Tensor) -> bool: ## +def nchw_dim_to_whcn_dim(nchw_dim: int, ndim: int) -> int: + # Handle negative indices for nchw_dim + if nchw_dim < 0: + nchw_dim += ndim + + assert nchw_dim >= 0 and nchw_dim < ndim + whcn_dim = (ndim - 1) - nchw_dim + return whcn_dim + + def get_tensor_val_str(tensor_val: FakeTensor) -> str: return f"{tensor_val.dtype}: {tensor_val.shape}" @@ -1269,6 +1291,7 @@ def update_program_state_dict( updated_tensor: torch.Tensor, ) -> None: target_name = None + kind = None # Iterate over all the tensors in the graph signature, and find # the one corresponding to the parameter/buffer name for input_ in program.graph_signature.input_specs: @@ -1277,6 +1300,7 @@ def update_program_state_dict( and isinstance(input_.arg, TensorArgument) and input_.arg.name == buffer_name ): + kind = input_.kind target_name = input_.target break @@ -1286,6 +1310,9 @@ def update_program_state_dict( ), f"could not find {buffer_name} in source program signature" assert target_name in program.state_dict, f"could not find {target_name}" + if kind == InputKind.PARAMETER: + updated_tensor = torch.nn.Parameter(updated_tensor, requires_grad=False) + # Finally, overwrite the current tensor with updated tensor program.state_dict[target_name] = updated_tensor diff --git a/backends/vulkan/vulkan_preprocess.py b/backends/vulkan/vulkan_preprocess.py index 95da66494e0..876f7fa8900 100644 --- a/backends/vulkan/vulkan_preprocess.py +++ b/backends/vulkan/vulkan_preprocess.py @@ -8,7 +8,7 @@ from functools import partial -from typing import Any, Dict, final, List +from typing import Any, Callable, Dict, final, List import executorch.backends.vulkan.utils as utils @@ -24,6 +24,7 @@ insert_prepack_nodes, RemoveLocalScalarDenseOpsTransform, RemoveRedundantOpsTransform, + ReplaceQDQPass, SqueezeUnsqueezeInputs, TagMemoryMetaPass, ) @@ -55,7 +56,9 @@ from 
executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass -from executorch.exir.program._program import _copy_module +from executorch.exir.program._program import _transform + +from torch._export.verifier import Verifier from torch.export._remove_auto_functionalized_pass import ( unsafe_remove_auto_functionalized_pass, @@ -64,28 +67,34 @@ DEFAULT_DEBUG_HANDLE = 65535 +class _any_op(Verifier): + # Set training dialect to skip functional check in base verifier + dialect = "TRAINING" + + def allowed_op_types(self): + return (Callable,) + + # pyre-ignore def apply_passes(program: ExportedProgram, passes) -> ExportedProgram: for p in passes: - if issubclass(type(p), ExportPass) or issubclass(type(p), PassBase): - new_gm = program.graph_module - # This is a workaround to allow the memory planning pass to work without - # having to first apply ToOutVarPass(). See the `greedy()` function in - # `exir.memory_planning`; if this attribute isn't set, assertions in - # `collect_spec_from_nodes()` will fail. - if isinstance(p, MemoryPlanningPass): - new_gm.encounter_to_out_var_failure = True - - new_gm_res = p(new_gm) - assert new_gm_res is not None - new_gm = new_gm_res.graph_module - + if isinstance(p, MemoryPlanningPass) and hasattr(p, "run"): + p.run(program.graph_module) + + elif issubclass(type(p), ExportPass) or issubclass(type(p), PassBase): + # Some passes require the ep to be provided. However, since the ep may be + # updated with each pass applied, the ep must be set right before calling + # the pass. _exported_program is the attribute used by XNNPACK and Vulkan + # passes to store the exported program. + if hasattr(p, "_exported_program"): + p._exported_program = program + + program = _transform(program, p, override_verifiers=[_any_op]) # See the application of this function in exir/program/_program.py for more # details on why this step is necessary. 
if isinstance(p, SpecPropPass): - p.update_placeholder_tensor_specs(program, new_gm) + p.update_placeholder_tensor_specs(program, program.graph_module) - _copy_module(program.graph_module, new_gm) else: program = p(program) @@ -158,16 +167,17 @@ def preprocess( # noqa: C901 program = apply_passes( program, [ - FusePatternsPass(program), - RemoveRedundantOpsTransform(), + FuseBatchNormPass(program), + FusePatternsPass(), + FuseClampPass(), AddmmToLinearTransform(), - FuseQuantizedOpsTransform(program), - FoldQDQPass(program), + RemoveRedundantOpsTransform(), + FuseQuantizedOpsTransform(), + ReplaceQDQPass(), + FoldQDQPass(), SqueezeUnsqueezeInputs(), FuseViewCopyTransform(), ViewCopyToSqueezeUnsqueezePass(), - FuseBatchNormPass(program), - FuseClampPass(), ], ) @@ -213,6 +223,11 @@ def preprocess( # noqa: C901 mem_planning_suite = MemoryPlanningAlgorithmSuite( algo_list=[greedy_memory_planning] ) + # This is a workaround to allow the memory planning pass to work without having + # to first apply ToOutVarPass(). See the `greedy()` function in + # `exir.memory_planning`; if this attribute isn't set, assertions in + # `collect_spec_from_nodes()` will fail. 
+ program.graph_module.encounter_to_out_var_failure = True program = apply_passes( program, [ diff --git a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py index 85e9889ca36..c1bc3a54f7c 100644 --- a/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py +++ b/backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py @@ -110,7 +110,9 @@ def is_nhwc_node(node: torch.fx.Node) -> bool: if len(quantize_node.all_input_nodes) > 0: actual_node = quantize_node.args[0] if actual_node.op == "placeholder": - return not actual_node.meta["val"][0].is_contiguous() + return ChannelsLastTaggedReshapePass._is_nhwc_tensor( + actual_node.meta["val"][0] + ) else: return actual_node.meta.get( ChannelsLastTaggedReshapePass.XNN_NHWC_NODE, False @@ -125,7 +127,9 @@ def is_nchw_node(node: torch.fx.Node) -> bool: if len(quantize_node.all_input_nodes) > 0: actual_node = quantize_node.args[0] if actual_node.op == "placeholder": - return actual_node.meta["val"][0].is_contiguous() + return not ChannelsLastTaggedReshapePass._is_nhwc_tensor( + actual_node.meta["val"][0] + ) else: return not actual_node.meta.get( ChannelsLastTaggedReshapePass.XNN_NHWC_NODE, False @@ -133,6 +137,26 @@ def is_nchw_node(node: torch.fx.Node) -> bool: return not ChannelsLastTaggedReshapePass.is_nhwc_node(node) + @staticmethod + def _is_nhwc_tensor(tensor: torch.Tensor) -> bool: + nhwc = tensor.is_contiguous(memory_format=torch.channels_last) + nchw = tensor.is_contiguous() + # if both are true false + # if both nchw and nhwc are true + # then we want to see this is nchw hence return false + # if either of nchw or nhwc is false, then just rely on hwc + # if both are false, mayb channels_last_3d, then return nhwc + # however this should not happen here + # return (not (nchw and nhwc)) and nhwc + # Readable version + if nchw and nhwc: + return False + else: + return nhwc + + def _is_nhwc(self, tensor: torch.Tensor) -> bool: + return 
ChannelsLastTaggedReshapePass._is_nhwc_tensor(tensor) + def requires_nhwc_input(self, node: torch.fx.Node) -> bool: return node.target in self.memory_sensitive_ops_nhwc @@ -315,11 +339,8 @@ def input_dim_order( self, input_node: torch.fx.Node, input_order: InputDimOrder ) -> bool: if input_node.op == "placeholder": - return ( - input_node.meta["val"].is_contiguous() - if input_order == InputDimOrder.NCHW - else not input_node.meta["val"].is_contiguous() - ) + is_nhwc = self._is_nhwc(input_node.meta["val"]) + return not is_nhwc if input_order == InputDimOrder.NCHW else is_nhwc else: return ( ChannelsLastTaggedReshapePass.is_nchw_node(input_node) @@ -348,7 +369,7 @@ def input_to_nhwc( self.mark_as_nhwc_node(input_node) if input_node.op == "placeholder": - if not input_node.meta["val"][0].is_contiguous(): + if self._is_nhwc(input_node.meta["val"][0]): return elif ChannelsLastTaggedReshapePass.is_nhwc_node(input_node): return @@ -420,7 +441,7 @@ def input_to_nchw( self.mark_as_nchw_node(input_node) if input_node.op == "placeholder": - if input_node.meta["val"].is_contiguous(): + if not self._is_nhwc(input_node.meta["val"]): return elif ChannelsLastTaggedReshapePass.is_nchw_node(input_node): return @@ -462,17 +483,17 @@ def call(self, graph_module: torch.fx.GraphModule): # noqa: C901 and isinstance(node.meta["val"], torch.Tensor) and len(node.meta["val"].shape) == 4 ): - if node.meta["val"].is_contiguous(): - self.mark_as_nchw_node(node) - else: + if self._is_nhwc(node.meta["val"]): self.mark_as_nhwc_node(node) + else: + self.mark_as_nchw_node(node) continue # Need special case for output node because it can have multiple output dim orders as we can output a tuple multiple nodes if node.op == "output": out_tuple = node.args[0] for out_node in out_tuple: - if out_node.meta["val"].is_contiguous(): + if not self._is_nhwc(out_node.meta["val"]): self.input_to_nchw(graph_module, out_node, node) else: self.input_to_nhwc(graph_module, out_node, node) diff --git 
a/backends/xnnpack/operators/__init__.py b/backends/xnnpack/operators/__init__.py index d17b7abd6a1..93424b1c84d 100644 --- a/backends/xnnpack/operators/__init__.py +++ b/backends/xnnpack/operators/__init__.py @@ -41,6 +41,7 @@ op_relu, op_rsqrt, op_sigmoid, + op_sin, op_skip_ops, op_slice_copy, op_softmax, diff --git a/backends/xnnpack/operators/op_sin.py b/backends/xnnpack/operators/op_sin.py new file mode 100644 index 00000000000..56fe9396103 --- /dev/null +++ b/backends/xnnpack/operators/op_sin.py @@ -0,0 +1,52 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Dict + +import torch +from executorch.backends.xnnpack.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import ( + XNNGraph, + XNNSin, + XNode, +) +from executorch.backends.xnnpack.utils.utils import get_input_node + + +@register_node_visitor +class SinVisitor(NodeVisitor): + target = "aten.sin.default" + + def __init__(self, *args) -> None: + super().__init__(*args) + + def define_node( + self, + node: torch.fx.Node, + xnn_graph: XNNGraph, + vals_to_ids: Dict[torch.fx.Node, int], + debug_handle: int, + ) -> None: + self.define_nodes_tensor_inputs_outputs(node, xnn_graph, vals_to_ids) + + # input + input_id = vals_to_ids[get_input_node(node, 0)] + + # output + output_id = vals_to_ids[node] + + ser_node = XNode( + xnode_union=XNNSin( + input_id=input_id, + output_id=output_id, + flags=0, + ), + debug_handle=debug_handle, + ) + xnn_graph.xnodes.append(ser_node) diff --git a/backends/xnnpack/partition/config/__init__.py b/backends/xnnpack/partition/config/__init__.py index e393f1c9ac8..86baba3e3f7 100644 --- a/backends/xnnpack/partition/config/__init__.py +++ b/backends/xnnpack/partition/config/__init__.py @@ -45,6 +45,7 @@ 
ReciprocalSquareRootConfig, ReLUConfig, SigmoidConfig, + SinConfig, SliceCopyConfig, SoftmaxConfig, SquareRootConfig, @@ -105,6 +106,7 @@ TanhConfig, ToDimOrderCopyConfig, SigmoidConfig, + SinConfig, SliceCopyConfig, SoftmaxConfig, SquareRootConfig, diff --git a/backends/xnnpack/partition/config/generic_node_configs.py b/backends/xnnpack/partition/config/generic_node_configs.py index 559d1522275..06024c632c9 100644 --- a/backends/xnnpack/partition/config/generic_node_configs.py +++ b/backends/xnnpack/partition/config/generic_node_configs.py @@ -636,3 +636,10 @@ class BMMConfig(GenericNodePartitionerConfig): def supported_precision_types(self) -> List[ConfigPrecisionType]: return [ConfigPrecisionType.FP32] + + +class SinConfig(GenericNodePartitionerConfig): + target_name = "sin.default" + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32] diff --git a/backends/xnnpack/runtime/XNNCompiler.cpp b/backends/xnnpack/runtime/XNNCompiler.cpp index 78eaaf6d039..b71ab08ea45 100644 --- a/backends/xnnpack/runtime/XNNCompiler.cpp +++ b/backends/xnnpack/runtime/XNNCompiler.cpp @@ -174,13 +174,12 @@ payload (deprecated) or via offsets to the constant_data_ptr. If no constant data associated with the tensor value, then returns nullptr. 
*/ const uint8_t* getConstantDataPtr( - const fb_xnnpack::XNNTensorValue* tensor_value, + uint32_t buffer_idx, GraphPtr flatbuffer_graph, const uint8_t* constant_data_ptr, const NamedDataMap* named_data_map, std::vector& freeable_buffers, XNNWeightsCache* weights_cache) { - auto buffer_idx = tensor_value->constant_buffer_idx(); if (buffer_idx) { if (!constant_data_ptr) { // TODO(T172265611): Remove constant_buffer in flatbuffer path after BC @@ -230,6 +229,22 @@ const uint8_t* getConstantDataPtr( return nullptr; } +const uint8_t* getConstantDataPtr( + const fb_xnnpack::XNNTensorValue* tensor_value, + GraphPtr flatbuffer_graph, + const uint8_t* constant_data_ptr, + const NamedDataMap* named_data_map, + std::vector& freeable_buffers, + XNNWeightsCache* weights_cache) { + return getConstantDataPtr( + tensor_value->constant_buffer_idx(), + flatbuffer_graph, + constant_data_ptr, + named_data_map, + freeable_buffers, + weights_cache); +} + /** Define serialized tensor value into the subgraph. While also keeping track of the remapped ids from @@ -434,22 +449,15 @@ Error defineTensor( const float* scale = qparams->scale()->data(); if (qparams->scale_buffer_idx() != 0) { - // if scales are stored in named data, then retrieve it - ConstantDataOffsetPtr scale_buffer_offset = - flatbuffer_graph->constant_data()->Get( - qparams->scale_buffer_idx()); - const std::string& data_name = - scale_buffer_offset->named_key()->str(); - Result scale_buffer = - named_data_map->get_data(data_name.c_str()); + scale = reinterpret_cast(getConstantDataPtr( + qparams->scale_buffer_idx(), + flatbuffer_graph, + constant_data_ptr, + named_data_map, + freeable_buffers, + weights_cache)); ET_CHECK_OR_RETURN_ERROR( - scale_buffer.ok(), - Internal, - "Failed to get constant data for key %s from named_data_map. 
Error code: %u", - data_name.c_str(), - static_cast(scale_buffer.error())); - scale = reinterpret_cast(scale_buffer.get().data()); - freeable_buffers.push_back(std::move(scale_buffer.get())); + scale != nullptr, Internal, "Failed to load scale data."); } status = xnn_define_channelwise_quantized_tensor_value_v2( /*subgraph=*/subgraph_ptr, @@ -483,22 +491,15 @@ Error defineTensor( // Block scales are preferably serialized as bf16 but can also be // serialized as fp32 for backwards compatability. if (qparams->scale_buffer_idx() != 0) { - ConstantDataOffsetPtr scale_buffer_offset = - flatbuffer_graph->constant_data()->Get( - qparams->scale_buffer_idx()); - const std::string& data_name = - scale_buffer_offset->named_key()->str(); - Result scale_buffer = - named_data_map->get_data(data_name.c_str()); + scale_data = reinterpret_cast(getConstantDataPtr( + qparams->scale_buffer_idx(), + flatbuffer_graph, + constant_data_ptr, + named_data_map, + freeable_buffers, + weights_cache)); ET_CHECK_OR_RETURN_ERROR( - scale_buffer.ok(), - Internal, - "Failed to get constant data for key %s from named_data_map. Error code: %u", - data_name.c_str(), - static_cast(scale_buffer.error())); - scale_data = - reinterpret_cast(scale_buffer.get().data()); - freeable_buffers.push_back(std::move(scale_buffer.get())); + scale_data != nullptr, Internal, "Failed to load scale data."); scale_numel = qparams->num_scales(); } else { // Read fp32 scales, convert to bf16. 
@@ -1689,6 +1690,7 @@ _DEFINE_UNARY_NODE_NO_PARAMS(Log, xnn_unary_log) _DEFINE_UNARY_NODE_NO_PARAMS(Negate, xnn_unary_negate) _DEFINE_UNARY_NODE_NO_PARAMS(Square, xnn_unary_square) _DEFINE_UNARY_NODE_NO_PARAMS(Abs, xnn_unary_abs) +_DEFINE_UNARY_NODE_NO_PARAMS(Sin, xnn_unary_sine) // Unary Ops with min/max params _DEFINE_UNARY_NODE_WITH_MINMAX(Clamp, xnn_unary_clamp) @@ -1736,6 +1738,7 @@ DefineNodeFunc getDefineNodeFunc(fb_xnnpack::XNodeUnion nodeType) { _DEFINE(Floor) _DEFINE(PReLU) _DEFINE(Sigmoid) + _DEFINE(Sin) // Others _DEFINE(FullyConnected) @@ -1895,9 +1898,8 @@ ET_NODISCARD Error XNNCompiler::compileModel( xnn_weights_cache_t weights_cache_ptr = nullptr; #endif -#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE - ET_CHECK_OR_RETURN_ERROR( - workspace != nullptr, Internal, "Failed to initialize XNNPACK workspace"); + // NOLINTBEGIN(facebook-hte-NullableDereference) - weights cache is allowed to + // be null status = xnn_create_runtime_v4( subgraph.get(), weights_cache_ptr, @@ -1905,14 +1907,7 @@ ET_NODISCARD Error XNNCompiler::compileModel( ::executorch::extension::threadpool::get_pthreadpool(), runtime_flags, &runtime_ptr); -#else - status = xnn_create_runtime_v3( - subgraph.get(), - weights_cache_ptr, - ::executorch::extension::threadpool::get_pthreadpool(), - runtime_flags, - &runtime_ptr); -#endif + // NOLINTEND(facebook-hte-NullableDereference) ET_CHECK_OR_RETURN_ERROR( xnn_status_success == status, diff --git a/backends/xnnpack/runtime/XNNExecutor.h b/backends/xnnpack/runtime/XNNExecutor.h index f7084a5dd88..c7926744dd6 100644 --- a/backends/xnnpack/runtime/XNNExecutor.h +++ b/backends/xnnpack/runtime/XNNExecutor.h @@ -9,13 +9,13 @@ #pragma once #include +#include #include #include #include #include #include -#include #include #include @@ -35,9 +35,11 @@ class XNNExecutor { std::vector output_ids_; std::vector externals_; std::vector packed_data_names_; + std::shared_ptr workspace_; public: - XNNExecutor() = default; + XNNExecutor(std::shared_ptr workspace) + : 
workspace_(workspace) {} inline size_t getNumInputs() { return input_ids_.size(); @@ -51,6 +53,10 @@ class XNNExecutor { return packed_data_names_; } + inline std::shared_ptr get_workspace() { + return workspace_; + } + /** * Initialize the XNNExecutor with a given runtime and input/output ids. * The input/output ids are expected to be sorted in order of their diff --git a/backends/xnnpack/runtime/XNNPACKBackend.cpp b/backends/xnnpack/runtime/XNNPACKBackend.cpp index b05919ecf2b..70845b6cab1 100644 --- a/backends/xnnpack/runtime/XNNPACKBackend.cpp +++ b/backends/xnnpack/runtime/XNNPACKBackend.cpp @@ -7,7 +7,10 @@ */ #include +#include #include +#include +#include #include #include #include @@ -21,14 +24,18 @@ namespace executorch { namespace backends { +using executorch::backends::xnnpack::WorkspaceSharingMode; +using executorch::backends::xnnpack::XNNWorkspace; using executorch::backends::xnnpack::delegate::XNNWeightsCache; using executorch::ET_RUNTIME_NAMESPACE::Backend; using executorch::ET_RUNTIME_NAMESPACE::BackendExecutionContext; using executorch::ET_RUNTIME_NAMESPACE::BackendInitContext; +using executorch::ET_RUNTIME_NAMESPACE::BackendOptionContext; using executorch::ET_RUNTIME_NAMESPACE::CompileSpec; using executorch::ET_RUNTIME_NAMESPACE::DelegateHandle; using executorch::ET_RUNTIME_NAMESPACE::NamedDataMap; using executorch::runtime::ArrayRef; +using executorch::runtime::BackendOption; using executorch::runtime::Error; using executorch::runtime::EValue; using executorch::runtime::FreeableBuffer; @@ -51,23 +58,8 @@ class XnnpackBackend final return; } -#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE - // Create a workspace for the XNNExecutor to use. This workspace will be - // shared across all delegate instances. 
- ET_LOG(Debug, "Creating XNN workspace"); - xnn_workspace_t workspace = nullptr; - status = xnn_create_workspace(&workspace); - if (status != xnn_status_success) { - ET_LOG( - Error, - "Failed to create XNN workspace, XNNPACK status: 0x%x", - (unsigned int)status); - workspace = nullptr; - return; - } - workspace_.reset(workspace); - ET_LOG(Debug, "Created XNN workspace: %p", workspace_.get()); -#endif // ENABLE_XNNPACK_SHARED_WORKSPACE + // Workspace manager is initialized with the appropriate default mode in its + // constructor } bool is_available() const override { @@ -85,11 +77,12 @@ class XnnpackBackend final } const NamedDataMap* named_data_map = context.get_named_data_map(); - // thread safe. This can heppen when multiple threads call init() on + // thread safe. This can happen when multiple threads call init() on // the same backend instance. -#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE - const std::lock_guard lock(workspace_mutex_); -#endif + + auto program_id = + reinterpret_cast(context.get_runtime_allocator()); + auto workspace = ET_UNWRAP(get_or_create_workspace(program_id)); #ifdef ENABLE_XNNPACK_WEIGHTS_CACHE const std::lock_guard lock_weight_cache(weights_cache_mutex_); @@ -97,17 +90,19 @@ class XnnpackBackend final context.get_runtime_allocator(), named_data_map); #endif + auto [workspace_lock, workspace_ptr] = workspace->acquire(); + // Executor has been allocated but not constructed, ensure that runtime_ is // nullptr by constructing it in place here. NOTE: Since we use placement // new and since this type is not trivially destructible, we must call the // destructor manually in destroy(). - new (executor) xnnpack::delegate::XNNExecutor; + new (executor) xnnpack::delegate::XNNExecutor(workspace); Error err = xnnpack::delegate::XNNCompiler::compileModel( processed->data(), processed->size(), executor, weights_cache_.get(), - workspace_.get(), + workspace_ptr, named_data_map); // This backend does not need its processed data after compiling the model. 
processed->Free(); @@ -130,14 +125,12 @@ Span args) const override { auto executor = static_cast(handle); -#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE - const std::lock_guard lock(workspace_mutex_); -#endif - #ifdef ENABLE_XNNPACK_WEIGHTS_CACHE const std::lock_guard lock_weights_cache(weights_cache_mutex_); #endif + auto [raii_lock, _] = executor->get_workspace()->acquire(); + // Prepare Inputs/Outputs and Propagate Input Shapes Error err = executor->prepare_args(args); if (err != Error::Ok) { @@ -158,13 +151,6 @@ void destroy(DelegateHandle* handle) const override { if (handle != nullptr) { - // This is needed to serialize access to xnn_delete_runtime which is not - // thread safe. This can heppen when multiple threads call destroy() on - // the same backend instance. -#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE - const std::lock_guard lock(workspace_mutex_); -#endif - auto executor = static_cast(handle); #ifdef ENABLE_XNNPACK_PROFILING @@ -176,18 +162,87 @@ weights_cache_mutex_); weights_cache_->delete_packed_data(executor->get_packed_data_names()); #endif + + // This is needed to serialize access to xnn_delete_runtime which is not + // thread safe. This can happen when multiple threads call destroy() on + // the same backend instance. Make sure to hold onto the workspace + // shared_ptr, as the pointer in the executor is freed, which includes + // the mutex referenced by raii_lock. + auto workspace = executor->get_workspace(); + auto [raii_lock, _] = workspace->acquire(); + + // XNNExecutor is not trivially destructible. Since this was constructed + // manually in init(), we must destroy it manually here. + executor->~XNNExecutor(); + } + } + + Error get_option_internal( + BackendOptionContext& context, + executorch::runtime::Span& + backend_options) const { + // Intentionally not locking here as it is not required.
+ + // Verify that the expected option key is present and modify the value + for (size_t i = 0; i < backend_options.size(); ++i) { + if (strcmp( + backend_options[i].key, + xnnpack::workspace_sharing_mode_option_key) == 0) { + // Set the value to what was stored by set_option + backend_options[i].value = + static_cast(workspace_manager_.get_sharing_mode()); + } + } + + return Error::Ok; + } + + Error get_option( + BackendOptionContext& context, + executorch::runtime::Span& + backend_options) override { + return get_option_internal(context, backend_options); + } + + Error set_option( + BackendOptionContext& context, + const executorch::runtime::Span& + backend_options) override { + if (backend_options.size() > 0) { + for (const auto& option : backend_options) { + if (strcmp(option.key, xnnpack::workspace_sharing_mode_option_key) == + 0) { + if (auto* val = std::get_if(&option.value)) { + if (*val < 0 || + *val > static_cast(WorkspaceSharingMode::Count)) { + ET_LOG( + Error, + "XNNPACK workspace sharing mode must be between 0 and %d, inclusive, but was %d.", + static_cast(WorkspaceSharingMode::Count), + *val); + return Error::InvalidArgument; + } + + ET_LOG( + Debug, "Setting XNNPACK workspace sharing mode to %d.", *val); + auto status = workspace_manager_.set_sharing_mode( + static_cast(*val)); + if (status != Error::Ok) { + return status; + } + } else { + ET_LOG(Error, "XNNPACK workspace sharing mode must be an integer."); + return Error::InvalidArgument; + } + } + } + } + return Error::Ok; + } + private: - // This is a global workspace for all delegate instances. - mutable std::mutex workspace_mutex_; - std::unique_ptr workspace_{ - nullptr, - &xnn_release_workspace}; + // Workspace manager for handling workspace sharing modes + mutable xnnpack::XNNWorkspaceManager workspace_manager_; // Weights cache is global to all delegate instances. 
mutable std::mutex weights_cache_mutex_; @@ -195,13 +250,21 @@ class XnnpackBackend final std::make_unique(); // Lock Hiearchy for Mutexes: - // workspace_mutex_ // weights_cache_mutex_ + // workspace_meta_mutex_ + // workspace_mutex_ (owned by executor) + + // Retrieve a workspace for the given method ID, depending on the sharing + // mode. + Result> get_or_create_workspace( + uintptr_t program_id) const { + return workspace_manager_.get_or_create_workspace(program_id); + } }; namespace { -auto cls = XnnpackBackend(); -Backend backend{"XnnpackBackend", &cls}; +auto backend_instance = XnnpackBackend(); +Backend backend{xnnpack::xnnpack_backend_key, &backend_instance}; static auto success_with_compiler = register_backend(backend); } // namespace diff --git a/backends/xnnpack/runtime/XNNPACKBackend.h b/backends/xnnpack/runtime/XNNPACKBackend.h new file mode 100644 index 00000000000..aca72f8652b --- /dev/null +++ b/backends/xnnpack/runtime/XNNPACKBackend.h @@ -0,0 +1,42 @@ +#pragma once + +namespace executorch::backends::xnnpack { +/// The key for the backend. This is used to register the backend, check +/// availability, and get/set options. +const char xnnpack_backend_key[] = "XnnpackBackend"; + +/// The key for the workspace sharing option. See the WorkspaceSharingMode enum +/// for a description of the associated functionality. +const char workspace_sharing_mode_option_key[] = "workspace_sharing_mode"; + +/// Workspace sharing mode. This is a backend option that can be set via the +/// set_option API to control memory sharing between CALL_DELEGATE instances. +/// This is useful for reducing memory consumption. +enum class WorkspaceSharingMode { + /// No workspace sharing. Each CALL_DELEGATE instance will have its own + /// workspace (memory arena). + Disabled = 0, + + /// All CALL_DELEGATE instances in a given program will share a workspace. 
+ /// This reduces memory consumption + /// for methods with multiple delegate calls, at the cost of only allowing one + /// method to execute at a time. + PerModel = 1, + + /// All CALL_DELEGATE instances across all loaded methods will share a + /// workspace. This reduces memory + /// consumption by overlapping activation memory between methods but enforces + /// synchronization between + /// methods. If multiple methods are run concurrently, it may block as only + /// one delegate call can occur + /// at a time. Additionally, the workspace does not shrink when a method is + /// unloaded, so memory will + /// only be reclaimed when all XNNPACK-delegated methods are unloaded. + Global = 2, + + /// The number of workspace sharing modes. This is not a valid mode and is + /// only used for tracking the + /// maximum enum value. + Count, +}; +} // namespace executorch::backends::xnnpack diff --git a/backends/xnnpack/runtime/XNNWorkspace.h b/backends/xnnpack/runtime/XNNWorkspace.h new file mode 100644 index 00000000000..36596b05089 --- /dev/null +++ b/backends/xnnpack/runtime/XNNWorkspace.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include +#include +#include + +namespace executorch::backends::xnnpack { + +using WorkspacePtr = + std::unique_ptr; + +/// A lightweight wrapper around an underlying xnn_workspace_t instance, bundled +/// with appropriate synchronization. +class XNNWorkspace { + public: + XNNWorkspace(WorkspacePtr workspace) : workspace_(std::move(workspace)){}; + XNNWorkspace(const XNNWorkspace&) = delete; + XNNWorkspace& operator=(const XNNWorkspace&) = delete; + // Not moveable due to std::mutex.
+ XNNWorkspace(XNNWorkspace&&) = delete; + XNNWorkspace& operator=(XNNWorkspace&&) = delete; + + std::pair, xnn_workspace_t> acquire() { + auto lock = std::unique_lock(mutex_); + return {std::move(lock), workspace_.get()}; + } + + // Return the workspace pointer without acquiring the lock. This should be used + // carefully, as it can lead to crashes or data corruption if the workspace is + // used concurrently. + xnn_workspace_t unsafe_get_workspace() { + return workspace_.get(); + } + + static runtime::Result> create() { + // Because this class can't be moved, we need to construct it in-place. + xnn_workspace_t workspace = nullptr; + auto status = xnn_create_workspace(&workspace); + if (status != xnn_status_success) { + ET_LOG( + Error, + "Failed to create XNN workspace, XNNPACK status: 0x%x", + (unsigned int)status); + return runtime::Error::Internal; + } + + return std::make_shared( + WorkspacePtr(workspace, &xnn_release_workspace)); + } + + private: + std::mutex mutex_; + WorkspacePtr workspace_; +}; + +} // namespace executorch::backends::xnnpack diff --git a/backends/xnnpack/runtime/XNNWorkspaceManager.cpp b/backends/xnnpack/runtime/XNNWorkspaceManager.cpp new file mode 100644 index 00000000000..d8c6dae4d6d --- /dev/null +++ b/backends/xnnpack/runtime/XNNWorkspaceManager.cpp @@ -0,0 +1,130 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree.
+ */ + +#include +#include +#include // For PRIuPTR + +namespace executorch::backends::xnnpack { + +using executorch::runtime::Error; +using executorch::runtime::Result; + +XNNWorkspaceManager::XNNWorkspaceManager() { +#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE + sharing_mode_ = WorkspaceSharingMode::Global; +#else + sharing_mode_ = WorkspaceSharingMode::Disabled; +#endif // ENABLE_XNNPACK_SHARED_WORKSPACE +} + +runtime::Error XNNWorkspaceManager::set_sharing_mode( + WorkspaceSharingMode mode) { + // Validate that the mode is valid + if (static_cast(mode) < 0 || + static_cast(mode) >= static_cast(WorkspaceSharingMode::Count)) { + ET_LOG( + Error, + "XNNPACK workspace sharing mode must be between 0 and %d, inclusive, but was %d.", + static_cast(WorkspaceSharingMode::Count) - 1, + static_cast(mode)); + return runtime::Error::InvalidArgument; + } + + sharing_mode_ = mode; + return runtime::Error::Ok; +} + +WorkspaceSharingMode XNNWorkspaceManager::get_sharing_mode() const { + return sharing_mode_.load(); +} + +Result> +XNNWorkspaceManager::get_or_create_workspace(uintptr_t program_id) const { + auto mode = sharing_mode_.load(); + + // Get or create the workspace according to the current sharing mode. + if (mode == WorkspaceSharingMode::Disabled) { + ET_LOG(Debug, "Instantiating workspace."); + auto create_result = XNNWorkspace::create(); + if (!create_result.ok()) { + return create_result.error(); + } + + return create_result.get(); + } else if (mode == WorkspaceSharingMode::PerModel) { + return get_or_create_model_workspace(program_id); + } else if (mode == WorkspaceSharingMode::Global) { + return get_or_create_global_workspace(); + } else { + ET_LOG( + Error, "Invalid workspace sharing mode: %d.", static_cast(mode)); + return Error::Internal; + } +} + +Result> +XNNWorkspaceManager::get_or_create_global_workspace() const { + std::scoped_lock lock(workspace_meta_mutex_); + + // Check for an existing (live) global workspace. 
+ std::shared_ptr workspace = {}; + if (auto live_workspace = global_workspace_.lock()) { + workspace = live_workspace; + } + + // Allocate a new workspace if needed. + if (!workspace) { + auto create_result = XNNWorkspace::create(); + if (!create_result.ok()) { + return create_result.error(); + } + workspace = create_result.get(); + ET_LOG( + Debug, + "Created global workspace %p.", + workspace->unsafe_get_workspace()); + global_workspace_ = workspace; + } + + return workspace; +} + +Result> +XNNWorkspaceManager::get_or_create_model_workspace(uintptr_t program_id) const { + std::scoped_lock lock(workspace_meta_mutex_); + + // Check for an existing (live) workspace for this program. + auto match = model_workspaces_.find(program_id); + std::shared_ptr workspace = {}; + if (match != model_workspaces_.end()) { + if (auto live_workspace = match->second.lock()) { + workspace = live_workspace; + } + } + + // Allocate a new workspace if needed. + if (!workspace) { + auto create_result = XNNWorkspace::create(); + if (!create_result.ok()) { + return create_result.error(); + } + workspace = create_result.get(); + ET_LOG( + Debug, + "Created workspace %p for program %" PRIuPTR ".", + workspace->unsafe_get_workspace(), + program_id); + model_workspaces_.insert( + {program_id, std::weak_ptr(workspace)}); + } + + return workspace; +} + +} // namespace executorch::backends::xnnpack diff --git a/backends/xnnpack/runtime/XNNWorkspaceManager.h b/backends/xnnpack/runtime/XNNWorkspaceManager.h new file mode 100644 index 00000000000..52db1184bbd --- /dev/null +++ b/backends/xnnpack/runtime/XNNWorkspaceManager.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include + +namespace executorch::backends::xnnpack { + +/** + * XNNWorkspaceManager manages XNNPACK workspaces based on the configured + * workspace sharing mode. + * + * It supports three modes: + * - Disabled: Each delegate instance gets its own workspace + * - PerModel: All delegate instances in a model share a workspace + * - Global: All delegate instances across all models share a workspace + */ +class XNNWorkspaceManager { + public: + XNNWorkspaceManager(); + ~XNNWorkspaceManager() = default; + + /** + * Set the workspace sharing mode. + * + * @param mode The workspace sharing mode to set. + * @return Error::Ok if the mode was set successfully. + */ + runtime::Error set_sharing_mode(WorkspaceSharingMode mode); + + /** + * Get the current workspace sharing mode. + * + * @return The current workspace sharing mode. + */ + WorkspaceSharingMode get_sharing_mode() const; + + /** + * Retrieve a workspace for the given program ID, depending on the sharing + * mode. A workspace will be created if needed. + * + * @param program_id The ID of the program requesting a workspace. + * @return A Result containing a shared_ptr to the workspace, or an error. + */ + runtime::Result> get_or_create_workspace( + uintptr_t program_id) const; + + private: + // The active sharing mode. Changes to this affect only models loaded after + // the change. + std::atomic sharing_mode_; + + // A mutex guarding global_workspace_ and model_workspaces_. Note that this + // mutex only guards the top-level definitions, not the contents of the + // workspace. The contents of the workspace are guarded by the workspace's own + // mutex in the XNNWorkspace class. + mutable std::mutex workspace_meta_mutex_; + + // A global workspace for all delegate instances, if global sharing is + // enabled. Lazy initialized. Stored as a weak pointer to allow automatic + // cleanup when all references are released. 
+ mutable std::weak_ptr global_workspace_; + + // A map from program id to workspace for delegate instances, if per model + // sharing is enabled. Workspaces are owned by the executor instances via + // shared_ptr. They are tracked here via weak pointers to allow automatic + // cleanup when the executors are destroyed while being retrievable when + // instantiating new executors. + mutable std::unordered_map> + model_workspaces_; + + // Retrieve the global workspace, lazy initializing it if needed. + runtime::Result> + get_or_create_global_workspace() const; + + // Get or create a workspace for the given program ID. + runtime::Result> get_or_create_model_workspace( + uintptr_t program_id) const; +}; + +} // namespace executorch::backends::xnnpack diff --git a/backends/xnnpack/serialization/runtime_schema.fbs b/backends/xnnpack/serialization/runtime_schema.fbs index 950318f18dc..239f92d899e 100644 --- a/backends/xnnpack/serialization/runtime_schema.fbs +++ b/backends/xnnpack/serialization/runtime_schema.fbs @@ -156,6 +156,7 @@ union XNodeUnion { XNNGelu: _XNNNode1x1, XNNTanh: _XNNNode1x1, XNNExp: _XNNNode1x1, + XNNSin: _XNNNode1x1, } union XValueUnion { diff --git a/backends/xnnpack/serialization/schema.fbs b/backends/xnnpack/serialization/schema.fbs index a4efc627cbb..92a61c5537b 100644 --- a/backends/xnnpack/serialization/schema.fbs +++ b/backends/xnnpack/serialization/schema.fbs @@ -152,6 +152,7 @@ union XNodeUnion { XNNGelu: _XNNNode1x1, XNNTanh: _XNNNode1x1, XNNExp: _XNNNode1x1, + XNNSin: _XNNNode1x1, } union XValueUnion { diff --git a/backends/xnnpack/serialization/xnnpack_graph_schema.py b/backends/xnnpack/serialization/xnnpack_graph_schema.py index 99b64708f86..2b3f8e74202 100644 --- a/backends/xnnpack/serialization/xnnpack_graph_schema.py +++ b/backends/xnnpack/serialization/xnnpack_graph_schema.py @@ -347,6 +347,11 @@ class XNNPReLU(XNNNode2x1): pass +@dataclass +class XNNSin(XNNNode1x1): + pass + + @dataclass class XNNScaledDotProductAttention: query_id: 
int @@ -402,6 +407,8 @@ class XNNScaledDotProductAttention: XNNLog, XNNGelu, XNNTanh, + XNNExp, + XNNSin, ] diff --git a/backends/xnnpack/targets.bzl b/backends/xnnpack/targets.bzl index 0eab89a00f9..796fd887e33 100644 --- a/backends/xnnpack/targets.bzl +++ b/backends/xnnpack/targets.bzl @@ -59,6 +59,9 @@ def define_common_targets(): exported_deps = [ "//executorch/runtime/backend:interface" + aten_suffix, ], + exported_headers = [ + "runtime/XNNPACKBackend.h", + ], deps = [ third_party_dep("XNNPACK"), "//executorch/backends/xnnpack/serialization:xnnpack_flatbuffer_header", @@ -70,3 +73,13 @@ def define_common_targets(): # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) link_whole = True, ) + + runtime.cxx_library( + name = "xnnpack_interface", + visibility = [ + "@EXECUTORCH_CLIENTS", + ], + exported_headers = [ + "runtime/XNNPACKBackend.h", + ], + ) diff --git a/backends/xnnpack/test/ops/test_sin.py b/backends/xnnpack/test/ops/test_sin.py new file mode 100644 index 00000000000..6a1b323e14c --- /dev/null +++ b/backends/xnnpack/test/ops/test_sin.py @@ -0,0 +1,87 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest + +import torch +from executorch.backends.xnnpack.test.tester import Tester + + +class TestSin(unittest.TestCase): + def setUp(self): + torch._dynamo.reset() + + class Sin(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + z = torch.sin(x) + return z + + def _test_sin(self, inputs, legacy_mode: bool = False): + tester = ( + Tester(self.Sin(), inputs) + .export() + .check_count({"torch.ops.aten.sin.default": 1}) + ) + + if legacy_mode: + tester = tester.to_edge().partition() + else: + tester = tester.to_edge_transform_and_lower() + + ( + tester.check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .check_not(["executorch_exir_dialects_edge__ops_aten_sin_default"]) + .to_executorch() + .serialize() + .run_method_and_compare_outputs() + ) + + def test_fp16_sin(self): + inputs = ( + torch.Tensor( + [ + [0.0, 0.1, 0.5, 0.785398], + [-0.5, -0.785398, 1.5708, -1.5708], + ], + ).to(torch.float16), + ) + self._test_sin(inputs, legacy_mode=False) + + def test_fp16_sin_legacy_mode(self): + inputs = ( + torch.Tensor( + [ + [0.0, 0.1, 0.5, 0.785398], + [-0.5, -0.785398, 1.5708, -1.5708], + ], + ).to(torch.float16), + ) + self._test_sin(inputs, legacy_mode=True) + + def test_fp32_sin(self): + inputs = ( + torch.Tensor( + [ + [0.0, 0.1, 0.5, 0.785398], + [-0.5, -0.785398, 1.5708, -1.5708], + ], + ), + ) + self._test_sin(inputs, legacy_mode=False) + + def test_fp32_sin_legacy_mode(self): + inputs = ( + torch.Tensor( + [ + [0.0, 0.1, 0.5, 0.785398], + [-0.5, -0.785398, 1.5708, -1.5708], + ], + ), + ) + self._test_sin(inputs, legacy_mode=True) diff --git a/backends/xnnpack/test/runtime/test_workspace_manager.cpp b/backends/xnnpack/test/runtime/test_workspace_manager.cpp new file mode 100644 index 00000000000..ddb7074a1ce --- /dev/null +++ b/backends/xnnpack/test/runtime/test_workspace_manager.cpp @@ -0,0 +1,280 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include +#include + +#include + +using namespace ::testing; + +using executorch::backends::xnnpack::WorkspaceSharingMode; +using executorch::backends::xnnpack::XNNWorkspace; +using executorch::backends::xnnpack::XNNWorkspaceManager; +using executorch::runtime::Error; +using executorch::runtime::Result; + +class XNNWorkspaceManagerTest : public ::testing::Test { + protected: + void SetUp() override { + // Log calls will abort if PAL is not initialized. + executorch::runtime::runtime_init(); + + // Initialize a new workspace manager for each test. + workspace_manager_ = std::make_unique(); + } + + std::unique_ptr workspace_manager_; +}; + +TEST_F(XNNWorkspaceManagerTest, SetAndGetSharingMode) { + // Test setting and getting the sharing mode + EXPECT_EQ( + workspace_manager_->set_sharing_mode(WorkspaceSharingMode::Disabled), + Error::Ok); + EXPECT_EQ( + workspace_manager_->get_sharing_mode(), WorkspaceSharingMode::Disabled); + + EXPECT_EQ( + workspace_manager_->set_sharing_mode(WorkspaceSharingMode::PerModel), + Error::Ok); + EXPECT_EQ( + workspace_manager_->get_sharing_mode(), WorkspaceSharingMode::PerModel); + + EXPECT_EQ( + workspace_manager_->set_sharing_mode(WorkspaceSharingMode::Global), + Error::Ok); + EXPECT_EQ( + workspace_manager_->get_sharing_mode(), WorkspaceSharingMode::Global); +} + +TEST_F(XNNWorkspaceManagerTest, SetInvalidSharingMode) { + // First set a valid mode to ensure we're starting from a known state. + EXPECT_EQ( + workspace_manager_->set_sharing_mode(WorkspaceSharingMode::Disabled), + Error::Ok); + EXPECT_EQ( + workspace_manager_->get_sharing_mode(), WorkspaceSharingMode::Disabled); + + // Try to set an invalid mode. 
+ WorkspaceSharingMode invalid_mode = static_cast(70); + EXPECT_EQ( + workspace_manager_->set_sharing_mode(invalid_mode), + Error::InvalidArgument); + + // The mode should not have changed. + EXPECT_EQ( + workspace_manager_->get_sharing_mode(), WorkspaceSharingMode::Disabled); +} + +TEST_F(XNNWorkspaceManagerTest, DisabledMode) { + // Verify that each call retrieves a new workspace when sharing is disabled. + workspace_manager_->set_sharing_mode(WorkspaceSharingMode::Disabled); + + uintptr_t program_id = 12345; + auto workspace1_result = + workspace_manager_->get_or_create_workspace(program_id); + ASSERT_TRUE(workspace1_result.ok()); + auto workspace1 = workspace1_result.get(); + + auto workspace2_result = + workspace_manager_->get_or_create_workspace(program_id); + ASSERT_TRUE(workspace2_result.ok()); + auto workspace2 = workspace2_result.get(); + + auto workspace3_result = + workspace_manager_->get_or_create_workspace(program_id + 1); + ASSERT_TRUE(workspace3_result.ok()); + auto workspace3 = workspace3_result.get(); + + EXPECT_NE(workspace1, workspace2); + EXPECT_NE(workspace1, workspace3); + EXPECT_NE(workspace2, workspace3); + EXPECT_NE( + workspace1->unsafe_get_workspace(), workspace2->unsafe_get_workspace()); + EXPECT_NE( + workspace1->unsafe_get_workspace(), workspace3->unsafe_get_workspace()); + EXPECT_NE( + workspace2->unsafe_get_workspace(), workspace3->unsafe_get_workspace()); +} + +TEST_F(XNNWorkspaceManagerTest, PerModelMode) { + // In PerModel mode, calls with the same program_id should return the same + // workspace. + workspace_manager_->set_sharing_mode(WorkspaceSharingMode::PerModel); + + // Get two workspaces with the same program ID and one different. 
+ uintptr_t program_id = 12345; + auto workspace1_result = + workspace_manager_->get_or_create_workspace(program_id); + ASSERT_TRUE(workspace1_result.ok()); + auto workspace1 = workspace1_result.get(); + + auto workspace2_result = + workspace_manager_->get_or_create_workspace(program_id); + ASSERT_TRUE(workspace2_result.ok()); + auto workspace2 = workspace2_result.get(); + + auto workspace3_result = + workspace_manager_->get_or_create_workspace(program_id + 1); + ASSERT_TRUE(workspace3_result.ok()); + auto workspace3 = workspace3_result.get(); + + // Workspace 1 and 2 should be the same, but different from workspace 3. + EXPECT_EQ(workspace1, workspace2); + EXPECT_EQ( + workspace1->unsafe_get_workspace(), workspace2->unsafe_get_workspace()); + + EXPECT_NE(workspace1, workspace3); + EXPECT_NE( + workspace1->unsafe_get_workspace(), workspace3->unsafe_get_workspace()); +} + +TEST_F(XNNWorkspaceManagerTest, GlobalMode) { + // In Global mode, all calls should return the same workspace. + workspace_manager_->set_sharing_mode(WorkspaceSharingMode::Global); + + // Get workspaces with different program IDs + uintptr_t program_id1 = 12345; + auto workspace1_result = + workspace_manager_->get_or_create_workspace(program_id1); + ASSERT_TRUE(workspace1_result.ok()); + auto workspace1 = workspace1_result.get(); + + uintptr_t program_id2 = 67890; + auto workspace2_result = + workspace_manager_->get_or_create_workspace(program_id2); + ASSERT_TRUE(workspace2_result.ok()); + auto workspace2 = workspace2_result.get(); + + EXPECT_EQ(workspace1, workspace2); + EXPECT_EQ( + workspace1->unsafe_get_workspace(), workspace2->unsafe_get_workspace()); +} + +TEST_F(XNNWorkspaceManagerTest, PerModelModeCleanup) { + // Test that workspaces are properly cleaned up when shared_ptr is destroyed + workspace_manager_->set_sharing_mode(WorkspaceSharingMode::PerModel); + + uintptr_t program_id = 12345; + xnn_workspace_t raw_workspace1 = nullptr; + + // Create a scope to control the lifetime of 
workspace1 + { + auto workspace1_result = + workspace_manager_->get_or_create_workspace(program_id); + ASSERT_TRUE(workspace1_result.ok()); + auto workspace1 = workspace1_result.get(); + + // Store the raw pointer for later comparison + raw_workspace1 = workspace1->unsafe_get_workspace(); + + // Let workspace1 go out of scope and be destroyed + } + + // Get a new workspace with the same program ID + auto workspace2_result = + workspace_manager_->get_or_create_workspace(program_id); + ASSERT_TRUE(workspace2_result.ok()); + auto workspace2 = workspace2_result.get(); + + // Since the previous workspace was destroyed, we should get a new one. + EXPECT_NE(workspace2->unsafe_get_workspace(), raw_workspace1); +} + +TEST_F(XNNWorkspaceManagerTest, GlobalModeCleanup) { + // Test that global workspaces are properly cleaned up when all users + // are destroyed. + workspace_manager_->set_sharing_mode(WorkspaceSharingMode::Global); + + uintptr_t program_id = 12345; + xnn_workspace_t raw_workspace1 = nullptr; + + // Create a scope to control the lifetime of workspace1 + { + auto workspace1_result = + workspace_manager_->get_or_create_workspace(program_id); + ASSERT_TRUE(workspace1_result.ok()); + auto workspace1 = workspace1_result.get(); + + // Store the raw pointer for later comparison + raw_workspace1 = workspace1->unsafe_get_workspace(); + + // Let workspace1 go out of scope and be destroyed + } + + // Get a new workspace (program ID doesn't matter in Global mode) + auto workspace2_result = + workspace_manager_->get_or_create_workspace(program_id); + ASSERT_TRUE(workspace2_result.ok()); + auto workspace2 = workspace2_result.get(); + + // Since the previous workspace was destroyed, we should get a new one. 
+ EXPECT_NE(workspace2->unsafe_get_workspace(), raw_workspace1); +} + +TEST_F(XNNWorkspaceManagerTest, SwitchingModes) { + // Test switching between different sharing modes + + // Start with Disabled mode + workspace_manager_->set_sharing_mode(WorkspaceSharingMode::Disabled); + + // Get a workspace + uintptr_t program_id = 12345; + auto workspace1_result = + workspace_manager_->get_or_create_workspace(program_id); + ASSERT_TRUE(workspace1_result.ok()); + auto workspace1 = workspace1_result.get(); + + // Switch to PerModel mode + workspace_manager_->set_sharing_mode(WorkspaceSharingMode::PerModel); + + // Get another workspace with the same program ID + auto workspace2_result = + workspace_manager_->get_or_create_workspace(program_id); + ASSERT_TRUE(workspace2_result.ok()); + auto workspace2 = workspace2_result.get(); + + // Should be a different workspace + EXPECT_NE(workspace1, workspace2); + + // Get another workspace with the same program ID in PerModel mode + auto workspace3_result = + workspace_manager_->get_or_create_workspace(program_id); + ASSERT_TRUE(workspace3_result.ok()); + auto workspace3 = workspace3_result.get(); + + // Should be the same workspace as workspace2 + EXPECT_EQ(workspace2, workspace3); + + // Switch to Global mode + workspace_manager_->set_sharing_mode(WorkspaceSharingMode::Global); + + // Get another workspace + auto workspace4_result = + workspace_manager_->get_or_create_workspace(program_id); + ASSERT_TRUE(workspace4_result.ok()); + auto workspace4 = workspace4_result.get(); + + // Should be a different workspace since we switched modes + EXPECT_NE(workspace3, workspace4); + + // Get a workspace with a different program ID in Global mode + uintptr_t different_program_id = 67890; + auto workspace5_result = + workspace_manager_->get_or_create_workspace(different_program_id); + ASSERT_TRUE(workspace5_result.ok()); + auto workspace5 = workspace5_result.get(); + + // Should be the same workspace as workspace4 + EXPECT_EQ(workspace4, 
workspace5); +} diff --git a/backends/xnnpack/test/runtime/test_workspace_sharing.cpp b/backends/xnnpack/test/runtime/test_workspace_sharing.cpp new file mode 100644 index 00000000000..66f0d012acd --- /dev/null +++ b/backends/xnnpack/test/runtime/test_workspace_sharing.cpp @@ -0,0 +1,179 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include +#include +#include +#include + +#include + +using namespace ::testing; + +using executorch::backends::xnnpack::workspace_sharing_mode_option_key; +using executorch::backends::xnnpack::WorkspaceSharingMode; +using executorch::backends::xnnpack::xnnpack_backend_key; +using executorch::extension::Module; +using executorch::extension::TensorPtr; +using executorch::runtime::BackendOption; +using executorch::runtime::BackendOptions; +using executorch::runtime::Error; + +TensorPtr create_input_tensor(float val); +void run_and_validate_two_models( + std::optional mode1 = std::nullopt, + std::optional mode2 = std::nullopt); +void set_and_check_workspace_sharing_mode(WorkspaceSharingMode mode); + +TEST(WorkspaceSharing, SetMode) { + // Try setting and reading back the mode a few times. + set_and_check_workspace_sharing_mode(WorkspaceSharingMode::Disabled); + set_and_check_workspace_sharing_mode(WorkspaceSharingMode::PerModel); + set_and_check_workspace_sharing_mode(WorkspaceSharingMode::Global); +} + +TEST(WorkspaceSharing, SetInvalidMode) { + // Make sure we can't set an invalid mode. + + // Set to an initial known value. + set_and_check_workspace_sharing_mode(WorkspaceSharingMode::PerModel); + + // Set to a bad value. 
+ BackendOptions<1> backend_options; + backend_options.set_option(workspace_sharing_mode_option_key, 70); + + auto status = executorch::runtime::set_option( + xnnpack_backend_key, backend_options.view()); + ASSERT_EQ(status, Error::InvalidArgument); + + // Make sure the option is still set to a valid value. + BackendOption read_option; + strcpy(read_option.key, workspace_sharing_mode_option_key); + read_option.value = -1; + status = get_option(xnnpack_backend_key, read_option); + + ASSERT_TRUE( + std::get(read_option.value) == + static_cast(WorkspaceSharingMode::PerModel)); +} + +TEST(WorkspaceSharing, RunWithDisabledMode) { + // Load and run some PTEs with workspace sharing disabled. + run_and_validate_two_models(WorkspaceSharingMode::Disabled); +} + +TEST(WorkspaceSharing, RunWithPerModelMode) { + // Load and run some PTEs with per-model workspace sharing. + run_and_validate_two_models(WorkspaceSharingMode::PerModel); +} + +TEST(WorkspaceSharing, RunWithGlobalMode) { + // Load and run some PTEs with global workspace sharing. + run_and_validate_two_models(WorkspaceSharingMode::Global); +} + +TEST(WorkspaceSharing, RunWithModeSwitch) { + // Check each pair of modes, loading one model in one mode and the other in + // the other mode. + + std::array modes = { + WorkspaceSharingMode::Disabled, + WorkspaceSharingMode::PerModel, + WorkspaceSharingMode::Global}; + + for (auto i = 0; i < modes.size(); ++i) { + for (auto j = i + 1; j < modes.size(); ++j) { + run_and_validate_two_models(modes[i], modes[j]); + } + } +} + +TensorPtr create_input_tensor(float val) { + // Create an f32 tensor with shape [10, 10, 10], matching the input of the + // test models. + std::vector data(1000, val); + + // Note that the tensor pointer takes ownership of the data vector. 
+ return executorch::extension::make_tensor_ptr({10, 10, 10}, std::move(data)); +} + +void run_and_validate_two_models( + std::optional mode1, + std::optional mode2) { + // Load and run two models, verifying that the output tensors are correct, + // optionally setting sharing mode. + + if (mode1) { + set_and_check_workspace_sharing_mode(*mode1); + } + + Module mod1(std::getenv("ET_XNNPACK_GENERATED_ADD_LARGE_PTE_PATH")); + + auto a = create_input_tensor(1.0); + auto b = create_input_tensor(2.0); + auto c = create_input_tensor(3.0); + + auto result = mod1.forward({a, b, c}); + EXPECT_TRUE(result.ok()); + + // Expected output is 2a + 2b + c. + auto output_val = 1.0 * 2 + 2.0 * 2 + 3.0; + auto& output_tensor = result.get()[0].toTensor(); + for (auto i = 0; i < output_tensor.numel(); ++i) { + ASSERT_EQ(output_tensor.const_data_ptr()[i], output_val); + } + + if (mode2) { + set_and_check_workspace_sharing_mode(*mode2); + } + + Module mod2(std::getenv("ET_XNNPACK_GENERATED_SUB_LARGE_PTE_PATH")); + + auto result2 = mod2.forward({a, b, c}); + EXPECT_TRUE(result2.ok()); + + // Expected output is zero (the subtract operations cancel out). 
+ auto& output_tensor2 = result2.get()[0].toTensor(); + for (auto i = 0; i < output_tensor2.numel(); ++i) { + ASSERT_EQ(output_tensor2.const_data_ptr()[i], 0); + } + + // Run mod1 again to validate that it gives correct results in the second mode + auto result3 = mod1.forward({a, b, c}); + EXPECT_TRUE(result3.ok()); + + // Expected output is still 2a + 2b + c + auto& output_tensor3 = result3.get()[0].toTensor(); + for (auto i = 0; i < output_tensor3.numel(); ++i) { + ASSERT_EQ(output_tensor3.const_data_ptr()[i], output_val); + } +} + +void set_and_check_workspace_sharing_mode(WorkspaceSharingMode mode) { + executorch::runtime::runtime_init(); + + BackendOptions<1> backend_options; + backend_options.set_option( + workspace_sharing_mode_option_key, static_cast(mode)); + + auto status = executorch::runtime::set_option( + xnnpack_backend_key, backend_options.view()); + ASSERT_EQ(status, Error::Ok); + + // Read the option back to sanity check. + BackendOption read_option; + strcpy(read_option.key, workspace_sharing_mode_option_key); + read_option.value = -1; + status = get_option(xnnpack_backend_key, read_option); + + ASSERT_TRUE(std::get(read_option.value) == static_cast(mode)); +} diff --git a/backends/xnnpack/test/runtime/test_xnnexecutor.cpp b/backends/xnnpack/test/runtime/test_xnnexecutor.cpp index b2a56f6283d..568c3c4ec35 100644 --- a/backends/xnnpack/test/runtime/test_xnnexecutor.cpp +++ b/backends/xnnpack/test/runtime/test_xnnexecutor.cpp @@ -18,7 +18,7 @@ using executorch::runtime::Span; using executorch::runtime::testing::TensorFactory; TEST(XNNExecutorTest, ArgumentWithTooManyDimensions) { - XNNExecutor executor; + XNNExecutor executor({}); xnn_subgraph_t subgraph = nullptr; xnn_runtime_t rt = nullptr; et_pal_init(); diff --git a/backends/xnnpack/test/targets.bzl b/backends/xnnpack/test/targets.bzl index f175e9655ea..04517c035fe 100644 --- a/backends/xnnpack/test/targets.bzl +++ b/backends/xnnpack/test/targets.bzl @@ -63,3 +63,26 @@ def 
define_common_targets(): "ET_MODULE_LINEAR_XNN_DATA_PATH": "$(location fbcode//executorch/test/models:exported_xnnpack_program_and_data[ModuleLinear.ptd])", }, ) + + runtime.cxx_test( + name = "test_workspace_sharing", + srcs = ["runtime/test_workspace_sharing.cpp"], + deps = [ + "//executorch/extension/module:module", + "//executorch/extension/tensor:tensor", + "//executorch/backends/xnnpack:xnnpack_backend", + ], + env = { + "ET_XNNPACK_GENERATED_ADD_LARGE_PTE_PATH": "$(location fbcode//executorch/test/models:exported_xnnp_delegated_programs[ModuleAddLarge.pte])", + "ET_XNNPACK_GENERATED_SUB_LARGE_PTE_PATH": "$(location fbcode//executorch/test/models:exported_xnnp_delegated_programs[ModuleSubLarge.pte])", + }, + ) + + runtime.cxx_test( + name = "test_workspace_manager", + srcs = ["runtime/test_workspace_manager.cpp"], + deps = [ + third_party_dep("XNNPACK"), + "//executorch/backends/xnnpack:xnnpack_backend", + ], + ) diff --git a/backends/xnnpack/xnnpack_preprocess.py b/backends/xnnpack/xnnpack_preprocess.py index 05fb53a837d..cdceb8a90a1 100644 --- a/backends/xnnpack/xnnpack_preprocess.py +++ b/backends/xnnpack/xnnpack_preprocess.py @@ -71,6 +71,11 @@ def generate_node_to_external_map( if node.op == "output": for output_nodes in node.args: for output_node in output_nodes: + if output_node in node_to_external_map: + raise RuntimeError( + f"Output node '{output_node}' is already in the inputs. " + "This is likely due to pass through arguments, which are not supported in XNNPACK Delegate." + ) node_to_external_map[output_node] = ExternalMeta( external_id=len(node_to_external_map), io_type=XNN_VALUE_FLAG_EXTERNAL_OUTPUT, diff --git a/codegen/tools/CMakeLists.txt b/codegen/tools/CMakeLists.txt index 489a96aafb6..2d61a4d68c1 100644 --- a/codegen/tools/CMakeLists.txt +++ b/codegen/tools/CMakeLists.txt @@ -1,5 +1,6 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. +# Copyright 2025 Arm Limited and/or its affiliates. 
# # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -24,10 +25,23 @@ target_include_directories( # Compile options target_compile_options( - selective_build PUBLIC -Wno-deprecated-declarations -fPIC -frtti -fexceptions + selective_build + PUBLIC -Wno-deprecated-declarations + -fPIC + -frtti + -fexceptions + -Werror + -Wunused-variable + -Wno-unknown-argument ) +# We suppress -Wno-unknown-argument because our build system passes -fPIC for +# Unix builds, but we also build on Windows where it's ignored # Link against required libraries +if(TARGET bundled_program) + target_compile_definitions(selective_build PRIVATE -DET_BUNDLE_IO) + target_link_libraries(selective_build PRIVATE bundled_program) +endif() target_link_libraries(selective_build PRIVATE executorch_core program_schema) # Install the module diff --git a/codegen/tools/combine_prim_ops_headers.py b/codegen/tools/combine_prim_ops_headers.py new file mode 100644 index 00000000000..b579de2047d --- /dev/null +++ b/codegen/tools/combine_prim_ops_headers.py @@ -0,0 +1,164 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Script to combine multiple selected_prim_ops.h header files into a single header. +This is used by selected_prim_operators_genrule to merge prim ops headers from dependencies. +""" + +import argparse +import os +import sys +from pathlib import Path +from typing import List, Set + + +def read_header_file(file_path: Path) -> Set[str]: + """ + Read a selected_prim_ops.h file and extract the macros and comments. 
+ + Args: + file_path: Path to the header file + + Returns: + macros_set where macros_set contains unique macro defines + """ + macros = set() + + try: + with open(file_path, "r") as f: + for line in f: + line = line.strip() + + # Extract #define statements for prim ops + if line.startswith("#define INCLUDE_") and not line.startswith( + "#define EXECUTORCH_ENABLE" + ): + macros.add(line) + except FileNotFoundError: + print(f"Warning: Header file not found: {file_path}", file=sys.stderr) + except Exception as e: + print(f"Error reading {file_path}: {e}", file=sys.stderr) + + return macros + + +def combine_prim_ops_headers(header_file_paths: List[str], output_path: str) -> None: + """ + Combine multiple selected_prim_ops.h files into a single header. + + Args: + header_files: List of paths to header files to combine + output_path: Path to output the combined header + """ + all_macros = set() + has_selective_build = False + + # Read all header files and collect unique macros + for header_file_path in header_file_paths: + header_file = Path(header_file_path) / "selected_prim_ops.h" + if os.path.exists(header_file): + macros = read_header_file(header_file) + all_macros.update(macros) + if len(all_macros) > 0: + has_selective_build = True + else: + print( + f"Warning: Header file does not exist: {header_file}", file=sys.stderr + ) + + # Generate combined header + header_content = [ + "// Combined header for selective prim ops build", + "// This file is auto-generated by combining multiple selected_prim_ops.h files", + "// Do not edit manually.", + "", + "#pragma once", + "", + ] + + if all_macros and has_selective_build: + header_content.extend( + [ + "// Enable selective build for prim ops", + "#define EXECUTORCH_ENABLE_PRIM_OPS_SELECTIVE_BUILD", + "", + "// Combined prim ops macros from all dependencies", + ] + ) + + # Sort macros for deterministic output + sorted_macros = sorted(all_macros) + header_content.extend(sorted_macros) + else: + header_content.extend( + [ + 
"// No prim ops found in dependencies - all prim ops will be included", + "// Selective build is disabled", + ] + ) + + header_content.append("") + + # Write the combined header + os.makedirs(os.path.dirname(output_path), exist_ok=True) + with open(output_path, "w") as f: + f.write("\n".join(header_content)) + + +def _get_header_file_paths_from_query_output(query_output_file: str) -> List[str]: + """ + Parse the output of a Buck query command to extract header file paths. + + Args: + query_output_file: Path to the file containing the query output + + Returns: + List of header file paths + """ + header_file_paths = [] + assert ( + query_output_file[0] == "@" + ), "query_output_file is not a valid file path, or it doesn't start with '@'." + query_output_file = query_output_file[1:] + + with open(query_output_file, "r") as f: + for line in f: + # Extract the header file path from the query output + header_file_paths += line.split() + return header_file_paths + + +def main(): + parser = argparse.ArgumentParser( + description="Combine multiple selected_prim_ops.h header files" + ) + parser.add_argument( + "--header_files", + required=True, + help="Comma-separated list of header file paths", + ) + parser.add_argument( + "--output_dir", required=True, help="Output directory for combined header" + ) + + args = parser.parse_args() + import os + + header_file_paths = _get_header_file_paths_from_query_output(args.header_files) + + if not header_file_paths: + print("Error: No header files provided", file=sys.stderr) + sys.exit(1) + + # Generate output path + output_path = os.path.join(args.output_dir, "selected_prim_ops.h") + + combine_prim_ops_headers(header_file_paths, output_path) + + +if __name__ == "__main__": + main() diff --git a/codegen/tools/gen_all_oplist.py b/codegen/tools/gen_all_oplist.py index 5cb93bb9153..f33c3dc935d 100644 --- a/codegen/tools/gen_all_oplist.py +++ b/codegen/tools/gen_all_oplist.py @@ -10,7 +10,7 @@ import sys from functools import reduce from 
pathlib import Path -from typing import Any, List +from typing import Any, Dict, List import yaml from torchgen.selective_build.selector import ( @@ -72,6 +72,19 @@ def _raise_if_check_prim_ops_fail(options): raise Exception(error) +def _selected_ops_model_dict_is_empty(model_dict: Dict[str, Any]) -> bool: + return ( + not model_dict.get("build_features", []) + and not model_dict.get("custom_classes", []) + and not model_dict.get("et_kernel_metadata", None) + and not model_dict.get("include_all_non_op_selectives", False) + and not model_dict.get("include_all_operators", False) + and not model_dict.get("kernel_metadata", {}) + and not model_dict.get("operators", {}) + ) + + +# flake8: noqa: C901 def main(argv: List[Any]) -> None: """This binary generates 3 files: @@ -171,6 +184,11 @@ def main(argv: List[Any]) -> None: ), f"{model_file_name} is not a valid file path. This is likely a BUCK issue." with open(model_file_name, "rb") as model_file: model_dict = yaml.safe_load(model_file) + # It is possible that we created an empty yaml file. + # This is because et_operator_library may only contain prim ops. + # In that case selected_operators.yaml will be empty. 
+ if _selected_ops_model_dict_is_empty(model_dict): + continue resolved = resolve_model_file_path_to_buck_target(model_file_name) for op in model_dict["operators"]: model_dict["operators"][op]["debug_info"] = [resolved] diff --git a/codegen/tools/gen_oplist.py b/codegen/tools/gen_oplist.py index cca5bf1b1d2..28506050a8e 100644 --- a/codegen/tools/gen_oplist.py +++ b/codegen/tools/gen_oplist.py @@ -9,6 +9,7 @@ import os import sys from enum import IntEnum +from pathlib import Path from typing import Any, Dict, List, Optional, Set import yaml @@ -158,7 +159,7 @@ def _get_et_kernel_metadata_from_ops_yaml(ops_yaml_path: str) -> Dict[str, List[ def _dump_yaml( op_list: List[str], - output_path: str, + output_path: Path, model_name: Optional[str] = None, et_kernel_metadata: Optional[Dict[str, List[str]]] = None, include_all_operators: bool = False, @@ -212,20 +213,23 @@ def create_kernel_key(maybe_kernel_key: str) -> str: def gen_oplist( - output_path: str, + output_path: Path, model_file_path: Optional[str] = None, ops_schema_yaml_path: Optional[str] = None, root_ops: Optional[str] = None, ops_dict: Optional[str] = None, include_all_operators: bool = False, ): - assert ( + if not ( model_file_path or ops_schema_yaml_path or root_ops or ops_dict or include_all_operators - ), "Need to provide either model_file_path or ops_schema_yaml_path or root_ops or ops_dict or include_all_operators." + ): + # dump empty yaml file + _dump_yaml([], output_path) + return assert output_path, "Need to provide output_path for dumped yaml file." 
op_set = set() @@ -326,9 +330,15 @@ def main(args: List[Any]) -> None: ) options = parser.parse_args(args) + # check if the output_path is a directory, then generate operators + # under selected_operators.yaml + if Path(options.output_path).is_dir(): + output_path = Path(options.output_path) / "selected_operators.yaml" + else: + output_path = Path(options.output_path) try: gen_oplist( - output_path=options.output_path, + output_path=output_path, model_file_path=options.model_file_path, ops_schema_yaml_path=options.ops_schema_yaml_path, root_ops=options.root_ops, diff --git a/codegen/tools/gen_selected_prim_ops.py b/codegen/tools/gen_selected_prim_ops.py new file mode 100644 index 00000000000..4535ffaa57a --- /dev/null +++ b/codegen/tools/gen_selected_prim_ops.py @@ -0,0 +1,96 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe + +import argparse +import os +import sys +from typing import Any, List + +from torchgen.code_template import CodeTemplate # type: ignore[import-not-found] + + +selected_prim_ops_h_template_str = """#pragma once +/** + * Generated by executorch/codegen/tools/gen_selected_prim_ops.py + */ + +$defines +""" +selected_prim_ops_h_template = CodeTemplate(selected_prim_ops_h_template_str) + + +def normalize_op_name(op_name: str) -> str: + """ + Normalize an operator name to a macro-safe format. 
+ Convert op names like "executorch_prim::et_view.default" to "EXECUTORCH_PRIM_ET_VIEW_DEFAULT" + or "aten::sym_size.int" to "ATEN_SYM_SIZE_INT" + """ + # Remove namespace separator and replace with underscore + normalized = op_name.replace("::", "_") + # Replace dots with underscores + normalized = normalized.replace(".", "_") + # Convert to uppercase + normalized = normalized.upper() + # Add INCLUDE_ prefix + normalized = f"INCLUDE_{normalized}" + return normalized + + +def write_selected_prim_ops(prim_op_names: List[str], output_dir: str) -> None: + """ + Generate selected_prim_ops.h from a list of prim op names. + + Args: + prim_op_names: List of prim op names like ["executorch_prim::et_view.default", "aten::sym_size.int"] + output_dir: Directory where to write selected_prim_ops.h + """ + # Generate #define statements for each op + defines = [] + for op_name in prim_op_names: + macro_name = normalize_op_name(op_name) + defines.append(f"#define {macro_name}") + + # Join all defines with newlines + defines_str = "\n".join(defines) + + # Generate header content + header_contents = selected_prim_ops_h_template.substitute(defines=defines_str) + + # Write to file + selected_prim_ops_path = os.path.join(output_dir, "selected_prim_ops.h") + with open(selected_prim_ops_path, "wb") as out_file: + out_file.write(header_contents.encode("utf-8")) + + +def main(argv: List[Any]) -> None: + parser = argparse.ArgumentParser(description="Generate selected prim ops header") + parser.add_argument( + "--prim-op-names", + "--prim_op_names", + help="Comma-separated list of prim op names to include", + required=True, + ) + parser.add_argument( + "--output-dir", + "--output_dir", + help="The directory to store the output header file (selected_prim_ops.h)", + required=True, + ) + + options = parser.parse_args(argv) + + # Parse comma-separated prim op names + prim_op_names = [ + name.strip() for name in options.prim_op_names.split(",") if name.strip() + ] + + 
write_selected_prim_ops(prim_op_names, options.output_dir) + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/codegen/tools/selective_build.cpp b/codegen/tools/selective_build.cpp index d33ff12ec9f..a34789e129d 100644 --- a/codegen/tools/selective_build.cpp +++ b/codegen/tools/selective_build.cpp @@ -1,16 +1,21 @@ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. + * Copyright 2025 Arm Limited and/or its affiliates. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ +#include +#include #include #include -#include -#include +#ifdef ET_BUNDLE_IO +#include +#include +#endif namespace py = pybind11; @@ -186,8 +191,39 @@ get_kernel_tensor_metadatas_from_execution_plan( const executorch_flatbuffer::Program* _get_program_from_buffer( const py::bytes& buffer) { + // Access the Python bytes without copying and get raw pointer/size. + const std::string_view sv = buffer.cast(); +#ifdef ET_BUNDLE_IO + void* buf_ptr = const_cast(static_cast(sv.data())); + const size_t buf_len = sv.size(); + + // If this is a bundled program, extract the inner ExecuTorch program bytes. + if (executorch::bundled_program::is_bundled_program(buf_ptr, buf_len)) { + const void* program_data = nullptr; + size_t program_size = 0; + + const auto status = executorch::bundled_program::get_program_data( + buf_ptr, // serialized BundledProgram start + buf_len, // total size of the BundledProgram blob + &program_data, // [out] pointer to inner .pte bytes + &program_size // [out] size of inner .pte bytes + ); + + if (status != ::executorch::runtime::Error::Ok || program_data == nullptr || + program_size == 0) { + throw std::runtime_error( + "bundled_program::get_program_data() failed or returned empty data"); + } + + // program_data points directly at the flatbuffer-encoded Program region. 
+ return executorch_flatbuffer::GetProgram( + reinterpret_cast(program_data)); + } +#endif + // Otherwise treat the buffer as a raw .pte (flatbuffer Program with optional + // extended header). return executorch_flatbuffer::GetProgram( - buffer.cast().data()); + reinterpret_cast(sv.data())); } py::list _get_program_operators(const executorch_flatbuffer::Program* program) { diff --git a/codegen/tools/targets.bzl b/codegen/tools/targets.bzl index acea3370e7d..c11982409f0 100644 --- a/codegen/tools/targets.bzl +++ b/codegen/tools/targets.bzl @@ -17,10 +17,8 @@ def define_common_targets(is_fbcode = False): ], deps = [ "//executorch/codegen:gen_lib", - ] + ([] if runtime.is_oss else select({ - "DEFAULT": [], - "ovr_config//os:linux": ["//executorch/codegen/tools:selective_build"], # TODO(larryliu0820) :selective_build doesn't build in OSS yet - })), + "//executorch/codegen/tools:selective_build", + ], ) runtime.python_binary( @@ -29,7 +27,7 @@ def define_common_targets(is_fbcode = False): deps = [ ":gen_oplist_lib", ], - preload_deps = [] if runtime.is_oss else ["//executorch/codegen/tools:selective_build"], # TODO(larryliu0820) :selective_build doesn't build in OSS yet + preload_deps = ["//executorch/codegen/tools:selective_build"], package_style = "inplace", visibility = [ "//executorch/...", @@ -103,6 +101,26 @@ def define_common_targets(is_fbcode = False): _is_external_target = True, ) + runtime.python_library( + name = "combine_prim_ops_headers_lib", + srcs = ["combine_prim_ops_headers.py"], + base_module = "executorch.codegen.tools", + visibility = ["//executorch/..."], + ) + + runtime.python_binary( + name = "combine_prim_ops_headers", + main_module = "executorch.codegen.tools.combine_prim_ops_headers", + package_style = "inplace", + visibility = [ + "PUBLIC", + ], + deps = [ + ":combine_prim_ops_headers_lib", + ], + _is_external_target = True, + ) + runtime.python_test( name = "test_gen_all_oplist", srcs = [ @@ -155,27 +173,48 @@ def 
define_common_targets(is_fbcode = False): _is_external_target = True, ) - if not runtime.is_oss: - runtime.cxx_python_extension( - name = "selective_build", - srcs = [ - "selective_build.cpp", - ], - base_module = "executorch.codegen.tools", - types = ["selective_build.pyi"], - preprocessor_flags = [ - "-DEXECUTORCH_PYTHON_MODULE_NAME=selective_build", - ], - deps = [ - "//executorch/runtime/core:core", - "//executorch/schema:program", - ], - external_deps = [ - "pybind11", - ], - use_static_deps = True, - visibility = ["//executorch/codegen/..."], - ) + runtime.python_library( + name = "gen_selected_prim_ops_lib", + srcs = ["gen_selected_prim_ops.py"], + base_module = "executorch.codegen.tools", + visibility = ["//executorch/..."], + external_deps = ["torchgen"], + ) + + runtime.python_binary( + name = "gen_selected_prim_ops", + main_module = "executorch.codegen.tools.gen_selected_prim_ops", + package_style = "inplace", + visibility = [ + "PUBLIC", + ], + deps = [ + ":gen_selected_prim_ops_lib", + ], + _is_external_target = True, + ) + + + runtime.cxx_python_extension( + name = "selective_build", + srcs = [ + "selective_build.cpp", + ], + base_module = "executorch.codegen.tools", + types = ["selective_build.pyi"], + preprocessor_flags = [ + "-DEXECUTORCH_PYTHON_MODULE_NAME=selective_build", + ], + deps = [ + "//executorch/runtime/core:core", + "//executorch/schema:program", + ], + external_deps = [ + "pybind11", + ], + use_static_deps = True, + visibility = ["//executorch/codegen/..."], + ) # TODO(larryliu0820): This is a hack to only run these two on fbcode. These targets depends on exir which is only available in fbcode. @@ -214,10 +253,12 @@ def define_common_targets(is_fbcode = False): ], ) + if runtime.is_oss or is_fbcode: + # Doesn't work on xplat. But works on fbcode and OSS. 
runtime.python_test( - name = "test_selective_build", + name = "test_tools_selective_build", srcs = [ - "test/test_selective_build.py", + "test/test_tools_selective_build.py", ], package_style = "inplace", visibility = [ diff --git a/codegen/tools/test/test_gen_oplist.py b/codegen/tools/test/test_gen_oplist.py index f5c6829d6a0..18689cd2505 100644 --- a/codegen/tools/test/test_gen_oplist.py +++ b/codegen/tools/test/test_gen_oplist.py @@ -8,6 +8,7 @@ import os import tempfile import unittest +from pathlib import Path from typing import Dict, List from unittest.mock import NonCallableMock, patch @@ -77,7 +78,7 @@ def test_gen_op_list_with_valid_root_ops( gen_oplist.main(args) mock_dump_yaml.assert_called_once_with( ["aten::add", "aten::mul"], - output_path, + Path(output_path), None, {"aten::add": ["default"], "aten::mul": ["default"]}, False, @@ -100,7 +101,7 @@ def test_gen_op_list_with_root_ops_and_dtypes( gen_oplist.main(args) mock_dump_yaml.assert_called_once_with( ["aten::add", "aten::mul"], - output_path, + Path(output_path), None, { "aten::add": [ @@ -129,7 +130,7 @@ def test_gen_op_list_with_both_op_list_and_ops_schema_yaml_merges( gen_oplist.main(args) mock_dump_yaml.assert_called_once_with( ["aten::add.out", "aten::mul.out", "aten::relu.out"], - output_path, + Path(output_path), test_path, { "aten::relu.out": ["default"], @@ -153,7 +154,7 @@ def test_gen_op_list_with_include_all_operators( gen_oplist.main(args) mock_dump_yaml.assert_called_once_with( ["aten::add", "aten::mul"], - output_path, + Path(output_path), None, {"aten::add": ["default"], "aten::mul": ["default"]}, True, @@ -164,7 +165,7 @@ def test_get_custom_build_selector_with_both_allowlist_and_yaml( ) -> None: op_list = ["aten::add", "aten::mul"] filename = os.path.join(self.temp_dir.name, "selected_operators.yaml") - gen_oplist._dump_yaml(op_list, filename, "model.pte") + gen_oplist._dump_yaml(op_list, Path(filename), "model.pte") self.assertTrue(os.path.isfile(filename)) with open(filename) 
as f: es = yaml.safe_load(f) diff --git a/codegen/tools/test/test_selective_build.py b/codegen/tools/test/test_tools_selective_build.py similarity index 100% rename from codegen/tools/test/test_selective_build.py rename to codegen/tools/test/test_tools_selective_build.py diff --git a/configurations/CMakeLists.txt b/configurations/CMakeLists.txt index fa5412ac476..fb154ff88bc 100644 --- a/configurations/CMakeLists.txt +++ b/configurations/CMakeLists.txt @@ -63,6 +63,6 @@ if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) install( TARGETS optimized_native_cpu_ops_lib EXPORT ExecuTorchTargets - DESTINATION lib + DESTINATION ${CMAKE_INSTALL_LIBDIR} ) endif() diff --git a/devtools/etdump/tests/etdump_test.cpp b/devtools/etdump/tests/etdump_test.cpp index d095844986f..fd35caca557 100644 --- a/devtools/etdump/tests/etdump_test.cpp +++ b/devtools/etdump/tests/etdump_test.cpp @@ -345,7 +345,7 @@ TEST_F(ProfilerETDumpTest, DebugEventTensorList) { EValue* values_p[2] = {&evalue_1, &evalue_2}; BoxedEvalueList a_box(values_p, storage, 2); - EValue evalue(a_box); + EValue evalue(&a_box); evalue.tag = Tag::ListTensor; etdump_gen[i]->create_event_block("test_block"); diff --git a/devtools/scripts/profile_model.sh b/devtools/scripts/profile_model.sh index 8697c97cd02..a4d50f6c6fc 100755 --- a/devtools/scripts/profile_model.sh +++ b/devtools/scripts/profile_model.sh @@ -7,7 +7,7 @@ #!/bin/bash -# ExecutorTorch Model Profiling Script +# ExecuTorch Model Profiling Script # # This script automates the process of building executor_runner with profiling enabled, # running model inference with ETDump collection, and generating CSV profiling reports. 
diff --git a/docs/.gitignore b/docs/.gitignore index 980fbad8320..b9b2a3753e5 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -3,3 +3,4 @@ /sphinxbuild_py /sphinxbuild_cpp /src +source/sg_execution_times.rst diff --git a/docs/Makefile b/docs/Makefile index 219998d4b4d..627358d0387 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -10,6 +10,9 @@ BUILDDIR = _build # Put it first so that "make" without argument is like "make help". +html-noplot: + $(SPHINXBUILD) -D plot_gallery=0 -b html $(SPHINXOPTS) "$(SOURCEDIR)" "$(BUILDDIR)/html" + help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/README.md b/docs/README.md index e30decb9362..845267b32f6 100644 --- a/docs/README.md +++ b/docs/README.md @@ -43,7 +43,7 @@ To build the documentation locally: git clone -b viable/strict https://github.com/pytorch/executorch.git && cd executorch ``` -1. If you don't have it already, start either a Python virtual envitonment: +1. If you don't have it already, start either a Python virtual environment: ```bash python3 -m venv .venv && source .venv/bin/activate && pip install --upgrade pip @@ -111,7 +111,7 @@ You can use the variables in both regular text and code blocks. ## Including READMEs to the Documentation Build You might want to include some of the `README.md` files from various directories -in this repositories in your documentation build. To do that, create an `.md` +in this repository in your documentation build. To do that, create an `.md` file and use the `{include}` directive to insert your `.md` files. Example: ```` @@ -177,7 +177,7 @@ file: ```` In the `index.md` file, I would add `tutorials/selective-build-tutorial` in -both the `toctree` and the `cusotmcarditem` sections. +both the `toctree` and the `customcarditem` sections. 
# Auto-generated API documentation diff --git a/docs/source/_static/css/custom.css b/docs/source/_static/css/custom.css deleted file mode 100644 index 3ae9585701e..00000000000 --- a/docs/source/_static/css/custom.css +++ /dev/null @@ -1,194 +0,0 @@ -/** - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* sphinx-design styles for cards/tabs -*/ -:root { - --sd-color-info: #ee4c2c; - --sd-color-primary: #6c6c6d; - --sd-color-primary-highlight: #f3f4f7; - --sd-color-card-border-hover: #ee4c2c; - --sd-color-card-border: #f3f4f7; - --sd-color-card-background: #fff; - --sd-color-card-text: inherit; - --sd-color-card-header: transparent; - --sd-color-card-footer: transparent; - --sd-color-tabs-label-active: #ee4c2c; - --sd-color-tabs-label-hover: #ee4c2c; - --sd-color-tabs-label-inactive: #6c6c6d; - --sd-color-tabs-underline-active: #ee4c2c; - --sd-color-tabs-underline-hover: #fabdbd; - --sd-color-tabs-underline-inactive: transparent; - --sd-color-tabs-overline: rgb(222, 222, 222); - --sd-color-tabs-underline: rgb(222, 222, 222); -} - -.sd-text-info { - color: #ee4c2c; -} - -.sd-card-img-top { - background: #ee4c2c; - height: 5px !important; -} - -.sd-card { - position: relative; - background-color: #fff; - opacity: 1.0; - border-radius: 0px; - width: 30%; - border: none; - padding-bottom: 0px; -} - - -.sd-card-img:hover { - opacity: 1.0; - background-color: #f3f4f7; -} - - -.sd-card:after { - display: block; - opacity: 1; - content: ''; - border-bottom: solid 1px #ee4c2c; - background-color: #fff; - transform: scaleX(0); - transition: transform .250s ease-in-out; - transform-origin: 0% 50%; -} - -.sd-card:hover { - background-color: #fff; - opacity: 1; - border-top: 1px solid #f3f4f7; - border-left: 1px solid #f3f4f7; - border-right: 1px solid #f3f4f7; -} - -.sd-card:hover:after { - transform: 
scaleX(1); -} - -.card-prerequisites:hover { - transition: none; - border: none; -} - -.card-prerequisites:hover:after { - transition: none; - transform: none; -} - -.card-prerequisites:after { - display: block; - content: ''; - border-bottom: none; - background-color: #fff; - transform: none; - transition: none; - transform-origin: none; -} - - -details.sd-dropdown { - font-weight: 300; - width: auto; -} - -details.sd-dropdown:after { - border: none; - transition: none; -} - -details.sd-dropdown:hover { - border: none; - transition: none; -} - -details.sd-dropdown .sd-summary-content { - font-weight: 300; -} - -details.sd-dropdown .highlight .n { - font-weight: normal; -} - -.et-page-column1 { - float: left; - width: 70%; - font-size: 1rem; -} - -.et-page-column2 { - float: right; - padding-top: 40px; - padding-left: 60px; - padding-right: 60px; - padding-bottom: 60px; - width: 30%; -} - -.et-page-column-row:after { - content: ""; - display: table; - clear: both; -} - -/* For screens smaller than 768px (typical mobile devices) */ -@media screen and (max-width: 768px) { - .et-page-column1, .et-page-column2 { - float: none; /* Remove floats */ - width: 100%; /* Full width for both columns */ - padding: 0; - font-size: 1rem; - } - - .et-page-column2 img { - display: none; - } - .et-page-column-row:after { - content: ""; - display: table; - clear: both; - } -} - -article.pytorch-article .class .method dt { - border-top: none; -} - -article.pytorch-article .class .simple dt { - border-top: none; -} - -article.pytorch-article .function dt.sig { - border-top: none; -} - -/* styles needed for 3rd level left nav */ - -.pytorch-left-menu ul, .pytorch-right-menu ul { - margin-left: 1.2em; -} - -.pytorch-left-menu li.toctree-l2.current > a { - color: #e44c2c; -} - -/* The next two styles enable normal hihglighting in the third level nav -in right side bar.*/ -#pytorch-right-menu .side-scroll-highlight { - color: #6c6c6d; -} - -#pytorch-right-menu 
a.reference.internal.side-scroll-highlight-local { - color: #ee4c2c; -} diff --git a/docs/source/_static/css/progress-bar.css b/docs/source/_static/css/progress-bar.css deleted file mode 100644 index 9b3aeb9d301..00000000000 --- a/docs/source/_static/css/progress-bar.css +++ /dev/null @@ -1,117 +0,0 @@ -/** - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -.progress-bar-wrapper { - margin-top: auto; - display: flex; - justify-content: space-between; - margin-bottom: 20px; - position: sticky; - top: 0; - background: white; - padding-top: 20px; - padding-bottom: 20px; - z-index: 2; -} - -.progress-bar-item { - position: relative; - display: flex; - flex-direction: column; - align-items: center; - flex: 1; - - @media (max-width: 768px) { - font-size: 12px; - } -} - -.progress-bar-item::before { - position: absolute; - content: ""; - border-bottom: 2px solid #ccc; - width: 100%; - top: 20px; - left: -50%; - z-index: 2; -} - -.progress-bar-item::after { - position: absolute; - content: ""; - border-bottom: 2px solid #ccc; - width: 100%; - top: 20px; - left: 50%; - z-index: 2; -} - -.progress-bar-item .step-number { - position: relative; - z-index: 5; - display: flex; - justify-content: center; - align-items: center; - width: 40px; - height: 40px; - border-radius: 50%; - border-color: #812CE5; - border-style: solid; - border-width: 1px; - color: #812CE5; - background: #fff; - margin-bottom: 6px; -} - -.progress-bar-item.active { - font-weight: bold; -} - -.progress-bar-item.completed .step-number { - background-color: #812CE5; - color: white; -} - -.progress-bar-item.completed::after { - position: absolute; - content: ""; - border-bottom: 2px solid #812CE5; - width: 100%; - top: 20px; - left: 50%; - z-index: 3; -} - -.progress-bar-item:first-child::before { - content: none; -} - 
-.progress-bar-item:last-child::after { - content: none; -} - -.progress-bar-item a:link { - color: #262626 !important; -} - -.step-caption:first-child { - margin-left: 10px; -} - -.step-caption { - text-align: center; -} - -.step-caption a:link { - color: #262626 !important; -} - -.step-caption a:hover { - color: #ee4c2c; - text-decoration: underline; -} diff --git a/docs/source/_static/img/ExecuTorch-Logo-cropped.svg b/docs/source/_static/img/ExecuTorch-Logo-cropped.svg deleted file mode 100644 index 9e0ef52fbd8..00000000000 --- a/docs/source/_static/img/ExecuTorch-Logo-cropped.svg +++ /dev/null @@ -1,57 +0,0 @@ - - - - - - - - - - - diff --git a/docs/source/_static/img/executorch-chip-logo-circle-16.png b/docs/source/_static/img/executorch-chip-logo-circle-16.png new file mode 100644 index 00000000000..a3966ae27db Binary files /dev/null and b/docs/source/_static/img/executorch-chip-logo-circle-16.png differ diff --git a/docs/source/_static/img/executorch-chip-logo-circle-32.png b/docs/source/_static/img/executorch-chip-logo-circle-32.png new file mode 100644 index 00000000000..83f1018a76c Binary files /dev/null and b/docs/source/_static/img/executorch-chip-logo-circle-32.png differ diff --git a/docs/source/_static/img/executorch-chip-logo.svg b/docs/source/_static/img/executorch-chip-logo.svg new file mode 100644 index 00000000000..11e5ed60956 --- /dev/null +++ b/docs/source/_static/img/executorch-chip-logo.svg @@ -0,0 +1,205 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/source/_static/js/progress-bar.js b/docs/source/_static/js/progress-bar.js deleted file mode 100644 index 878251cfc60..00000000000 --- a/docs/source/_static/js/progress-bar.js +++ /dev/null @@ -1,66 +0,0 @@ -/** - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -document.addEventListener("DOMContentLoaded", function() { - const steps = Array.from(document.querySelectorAll('.progress-bar-item')); - const h2s = Array.from(document.querySelectorAll('h2')); - - // Populate captions from h2s - h2s.forEach((h2, index) => { - const captionElem = document.getElementById(`caption-${index + 1}`); - if (captionElem) { - captionElem.innerText = h2.innerText; - } - }); - - // Throttle function to optimize performance - function throttle(func, delay) { - let lastCall = 0; - return function() { - const now = Date.now(); - if (now - lastCall < delay) return; - lastCall = now; - func.apply(this, arguments); - } - } - - document.addEventListener("scroll", throttle(function() { - let activeIndex = 0; - let closestDistance = Number.MAX_VALUE; - const totalHeight = document.documentElement.scrollHeight; - const viewportHeight = window.innerHeight; - const scrollBottom = window.scrollY + viewportHeight; - const isAtBottom = totalHeight === scrollBottom; - - h2s.forEach((h2, index) => { - const rect = h2.getBoundingClientRect(); - const distanceToTop = Math.abs(rect.top); - if (distanceToTop < closestDistance) { - closestDistance = distanceToTop; - activeIndex = index; - } - }); - - steps.forEach((step, index) => { - if (isAtBottom) { - step.classList.remove('active'); - step.classList.add('completed'); - } else { - if (index < activeIndex) { - step.classList.remove('active'); - step.classList.add('completed'); - } else if (index === activeIndex) { - step.classList.add('active'); - step.classList.remove('completed'); - } else { - step.classList.remove('active', 'completed'); - } - } - }); - }, 100)); -}); diff --git a/docs/source/_templates/layout.html b/docs/source/_templates/layout.html deleted file mode 100644 index 55f91103b35..00000000000 --- a/docs/source/_templates/layout.html +++ /dev/null @@ 
-1,145 +0,0 @@ -{% extends "!layout.html" %} - -{% block extrahead %} -{% if 'getting-started-setup' in pagename%} - - -{% elif 'compiler-delegate-and-partitioner' in pagename%} - - -{% elif 'xtensa' in pagename%} - - -{% elif 'qualcomm-ai-engine-direct-backend' in pagename%} - - -{% elif 'coreml' in pagename%} - - -{% elif 'mps' in pagename%} - - -{% endif %} -{{ super() }} -{% endblock %} - - -{% block sidebartitle %} - - {% include "searchbox.html" %} -{% endblock %} - -{%- block content %} -{% if 'tutorials' in pagename %} - - - -{% endif %} -{{ super() }} - -{% endblock %} - - - -{% block menu %} - {% if 'singlehtml' not in builder %} - {% set global_toc = toctree(collapse=theme_collapse_navigation|tobool, - includehidden=theme_includehidden|tobool, - titles_only=theme_titles_only|tobool) %} - {% endif %} - {% if global_toc %} - {{ global_toc }} - {% else %} - -
{{ toc }}
- {% endif %} -{% endblock %} - - -{% block footer %} -{{ super() }} - - -{{ super() }} - - -{{ super() }} - -{% endblock %} diff --git a/docs/source/advanced-topics-section.md b/docs/source/advanced-topics-section.md new file mode 100644 index 00000000000..e7b7f5490c6 --- /dev/null +++ b/docs/source/advanced-topics-section.md @@ -0,0 +1,112 @@ +(advanced-topics-section)= + +# Advanced + +Deep dive into ExecuTorch's advanced features for optimization, customization, and integration. + +This section covers advanced concepts for developers who need to customize ExecuTorch for specific use cases, optimize performance, or integrate with custom hardware backends. + +## Quantization & Optimization + +Techniques for model compression and performance optimization. + +**→ {doc}`quantization-optimization` — Quantization strategies and performance optimization** + +Key topics: + +- Quantization strategies and techniques +- Performance profiling and optimization + +## Model Export + +Learn the core ExecuTorch workflow, exporting PyTorch models to the `.pte` format for edge deployment. + +**→ {doc}`using-executorch-export`** - Model Export & Lowering + +Key topics: + +- Export and Lowering Workflow +- Hardware Backend Selection & Optimization +- Dynamic Shapes & Advanced Model Features + + +## Kernel Library + +Deep dive into ExecuTorch's kernel implementation and customization. + +**→ {doc}`kernel-library-advanced` — Kernel library deep dive and customization** + +Key topics: + +- Kernel library architecture +- Custom kernel implementation +- Selective build and optimization + +## Backend & Delegates + +**→ {doc}`backend-delegate-advanced` — Backend delegate integration** + +Key topics: + +- Learn how to integrate Backend Delegate into ExecuTorch and more +- XNNPACK Delegate Internals +- Debugging Delegation + + +## Runtime & Integration + +Advanced runtime features and backend integration. 
+ +**→ {doc}`runtime-integration-advanced` — Runtime customization and backend integration** + +Key topics: + +- Backend delegate implementation +- Platform abstraction layer +- Custom runtime integration + +## Compiler & IR + +Advanced compiler features and intermediate representation details. + +**→ {doc}`compiler-ir-advanced` — Compiler passes and IR specification** + +Key topics: + +- Custom compiler passes +- Memory planning strategies +- Backend dialect and EXIR +- Ops set definition + + +## File Formats + +ExecuTorch file format specifications and internals. + +**→ {doc}`file-formats-advanced` — PTE and PTD file format specifications** + +Key topics: + +- PTE file format internals +- PTD file format specification +- Custom file format handling + +## Next Steps + +After exploring advanced topics: + +- **{doc}`tools-sdk-section`** - Developer tools for debugging and profiling +- **{doc}`api-section`** - Complete API reference documentation + +```{toctree} +:hidden: +:maxdepth: 2 +:caption: Advanced Topics + +quantization-optimization +using-executorch-export +kernel-library-advanced +backend-delegate-advanced +runtime-integration-advanced +compiler-ir-advanced +file-formats-advanced diff --git a/docs/source/android-arm-vgf.md b/docs/source/android-arm-vgf.md new file mode 100644 index 00000000000..cc39b53e176 --- /dev/null +++ b/docs/source/android-arm-vgf.md @@ -0,0 +1 @@ +```{include} backends-arm-vgf.md diff --git a/docs/source/android-backends.md b/docs/source/android-backends.md new file mode 100644 index 00000000000..d506813990b --- /dev/null +++ b/docs/source/android-backends.md @@ -0,0 +1,28 @@ +(android-backends)= +# Backends + +Available hardware acceleration backends for Android deployment. 
+ +## CPU Acceleration + +- {doc}`android-xnnpack` — XNNPACK CPU acceleration + +## GPU Acceleration + +- {doc}`android-vulkan` — Vulkan GPU acceleration + +## NPU/Accelerator Backends + +- {doc}`android-qualcomm` — Qualcomm AI Engine (NPU) +- {doc}`android-mediatek` — MediaTek NPU acceleration +- {doc}`android-arm-vgf` — ARM VGF Backend +- {doc}`android-samsung-exynos` — Samsung Exynos NPU + +```{toctree} +:hidden: +android-xnnpack +android-vulkan +android-qualcomm +android-mediatek +android-arm-vgf +android-samsung-exynos diff --git a/docs/source/android-examples.md b/docs/source/android-examples.md new file mode 100644 index 00000000000..65580870c57 --- /dev/null +++ b/docs/source/android-examples.md @@ -0,0 +1,9 @@ +# Examples & Demos + +- [Working with LLMs - Android Examples](https://github.com/meta-pytorch/executorch-examples/tree/main/llm/android) +- [Demo Apps](https://github.com/meta-pytorch/executorch-examples/tree/main/dl3/android/DeepLabV3Demo#executorch-android-demo-app) +- {doc}`tutorial-arm-vgf` — Export a simple PyTorch model for the ExecuTorch VGF backend + +```{toctree} +:hidden: +tutorial-arm-vgf diff --git a/docs/source/android-mediatek.md b/docs/source/android-mediatek.md new file mode 100644 index 00000000000..7034fe439dd --- /dev/null +++ b/docs/source/android-mediatek.md @@ -0,0 +1 @@ +```{include} backends-mediatek.md diff --git a/docs/source/android-qualcomm.md b/docs/source/android-qualcomm.md new file mode 100644 index 00000000000..f484d771a8b --- /dev/null +++ b/docs/source/android-qualcomm.md @@ -0,0 +1 @@ +```{include} backends-qualcomm.md diff --git a/docs/source/android-samsung-exynos.md b/docs/source/android-samsung-exynos.md new file mode 100644 index 00000000000..4c5a470edca --- /dev/null +++ b/docs/source/android-samsung-exynos.md @@ -0,0 +1 @@ +```{include} backends-samsung-exynos.md diff --git a/docs/source/android-section.md b/docs/source/android-section.md new file mode 100644 index 00000000000..a5774352bc1 --- /dev/null 
+++ b/docs/source/android-section.md @@ -0,0 +1,23 @@ +(android-section)= + +# Android + +Deploy ExecuTorch on Android devices with hardware acceleration support. + +## Quick Start & Integration + +- {doc}`using-executorch-android` — Complete Android integration guide + +## Backends + +- {doc}`android-backends` — Available Android backends and acceleration options + +## Examples & Demos + +- {doc}`android-examples` — Explore Android Examples & Demos + +```{toctree} +:hidden: +using-executorch-android +android-backends +android-examples diff --git a/docs/source/android-vulkan.md b/docs/source/android-vulkan.md new file mode 100644 index 00000000000..6399ac4ec7c --- /dev/null +++ b/docs/source/android-vulkan.md @@ -0,0 +1 @@ +```{include} backends-vulkan.md diff --git a/docs/source/android-xnnpack.md b/docs/source/android-xnnpack.md new file mode 100644 index 00000000000..315dd747006 --- /dev/null +++ b/docs/source/android-xnnpack.md @@ -0,0 +1 @@ +```{include} backends-xnnpack.md diff --git a/docs/source/api-section.md b/docs/source/api-section.md new file mode 100644 index 00000000000..ab2573aefa9 --- /dev/null +++ b/docs/source/api-section.md @@ -0,0 +1,26 @@ +(api-section)= +# API + +In this section, find complete API documentation for ExecuTorch's export, runtime, and extension interfaces. Includes comprehensive references for Python, C++, and Java APIs across all supported platforms. 
+ +- {doc}`export-to-executorch-api-reference` — Export to ExecuTorch API Reference +- {doc}`executorch-runtime-api-reference` — ExecuTorch Runtime API Reference +- {doc}`runtime-python-api-reference` — Runtime Python API Reference +- {doc}`api-life-cycle` — API Life Cycle +- [Android doc →](https://pytorch.org/executorch/main/javadoc/) — Android API Documentation +- {doc}`extension-module` — Extension Module +- {doc}`extension-tensor` — Extension Tensor +- {doc}`running-a-model-cpp-tutorial` — Detailed C++ Runtime APIs Tutorial + +```{toctree} +:hidden: +:maxdepth: 1 +:caption: API Reference + +export-to-executorch-api-reference +executorch-runtime-api-reference +runtime-python-api-reference +api-life-cycle +extension-module +extension-tensor +running-a-model-cpp-tutorial diff --git a/docs/source/backend-delegate-advanced.md b/docs/source/backend-delegate-advanced.md new file mode 100644 index 00000000000..752bd1cdc02 --- /dev/null +++ b/docs/source/backend-delegate-advanced.md @@ -0,0 +1,33 @@ +(backend-delegate-advanced)= + +# Backend & Delegates + +## Integration + +- {doc}`backend-delegates-integration` — Learn how to integrate a backend delegate into ExecuTorch + +## XNNPACK Reference + +- {doc}`backend-delegates-xnnpack-reference` — Deep dive into XNNPACK delegate internals and implementation details + +## Dependency Management + +- {doc}`backend-delegates-dependencies` — Manage third-party dependencies for backend delegates + +## Overview + +- {doc}`compiler-delegate-and-partitioner` — Understanding backends, delegates, and the partitioner system + +## Debugging + +- {doc}`debug-backend-delegate` — Tools and techniques for debugging delegation issues + +```{toctree} +:hidden: +:maxdepth: 1 + +backend-delegates-integration +backend-delegates-xnnpack-reference +backend-delegates-dependencies +compiler-delegate-and-partitioner +debug-backend-delegate diff --git a/docs/source/backend-delegates-dependencies.md b/docs/source/backend-delegates-dependencies.md 
index f2068989bd2..06f23ca36bc 100644 --- a/docs/source/backend-delegates-dependencies.md +++ b/docs/source/backend-delegates-dependencies.md @@ -49,7 +49,7 @@ for these third-party dependencies. `executorch/third-party` then try to use that if possible. This helps with reducing the binary size when the delegate is enabled. * The rest of the ExecuTorch code, outside of the delegate, should not depend on - this. And it should should build and run correctly without this dependency + this. And it should build and run correctly without this dependency when the delegate is disabled at build time. More details in the section [below](#runtime-dependencies). diff --git a/docs/source/backend-delegates-integration.md b/docs/source/backend-delegates-integration.md index 0179ceff872..130da0d3225 100644 --- a/docs/source/backend-delegates-integration.md +++ b/docs/source/backend-delegates-integration.md @@ -23,12 +23,13 @@ the top level ExecuTorch package. For third-party dependencies, please refer to At a minimum, a delegate must provide CMake support for building its C++ sources. -For the CMake setup, the delegate dir should be included by the -top level `CMakeLists.txt` file using `add_subdirectory` CMake command, and -should be built conditionally with an ExecuTorch build flag like -`EXECUTORCH_BUILD_`, see `EXECUTORCH_BUILD_XNNPACK` for example. -For third-party dependencies, please refer to -[this](backend-delegates-dependencies.md). +For the CMake setup: + +- The delegate directory should be included by the top-level `CMakeLists.txt` file using the `add_subdirectory` command. +- It should be built conditionally using an ExecuTorch build flag like `EXECUTORCH_BUILD_`. +(See `EXECUTORCH_BUILD_XNNPACK` for an example.) + +For third-party dependencies, please refer to [this](backend-delegates-dependencies.md). on how to profile ExecuTorch models and use Developer Tools' Inspector API to view XNNPACK's internal profiling information. 
An example implementation is available in the `executor_runner` (see [tutorial here](tutorial-xnnpack-delegate-lowering.md#profiling)). [comment]: <> (TODO: Refactor quantizer to a more official quantization doc) diff --git a/docs/source/backend-development.md b/docs/source/backend-development.md new file mode 100644 index 00000000000..ec5ceb3b37a --- /dev/null +++ b/docs/source/backend-development.md @@ -0,0 +1,11 @@ +# Backend Development + +```{toctree} +:maxdepth: 1 + +backend-delegates-integration +backend-delegates-xnnpack-reference +backend-delegates-dependencies +compiler-delegate-and-partitioner +debug-backend-delegate +``` diff --git a/docs/source/backends-arm-ethos-u.md b/docs/source/backends-arm-ethos-u.md index 9b3d02b21c1..2dfddacd20f 100644 --- a/docs/source/backends-arm-ethos-u.md +++ b/docs/source/backends-arm-ethos-u.md @@ -1,7 +1,7 @@ # Arm® Ethos™-U NPU Backend The Arm® Ethos™-U backend targets Edge/IoT-type AI use-cases by enabling optimal execution of quantized models on -[Arm® Ethos™-U55 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u55), [Arm® Ethos™-U55 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u65), and +[Arm® Ethos™-U55 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u55), [Arm® Ethos™-U65 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u65), and [Arm® Ethos™-U85 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u85), leveraging [TOSA](https://www.mlplatform.org/tosa/) and the [ethos-u-vela](https://pypi.org/project/ethos-u-vela/) graph compiler. This document is a technical reference for using the Ethos-U backend, for a top level view with code examples please refer to the [Arm Ethos-U Backend Tutorial](https://docs.pytorch.org/executorch/stable/tutorial-arm-ethos-u.html). 
@@ -268,10 +268,18 @@ You can see how this coupling between the memory mode and runtime application i The arm_executor_runner supports [bundled-io](https://docs.pytorch.org/executorch/0.4/bundled-io.html) and [ETdump](https://docs.pytorch.org/executorch/stable/etdump.html) debugging tools. -To enable bundled-io, set `EXECUTORCH_BUILD_DEVTOOLS` when building Executorch and `DET_BUNDLE_IO` when building the executor_runner. Currently using bundled-io requires specifying your -non delegated Aten ops manually by setting `EXECUTORCH_SELECT_OPS_LIST`. To enable ETdump, set `EXECUTORCH_BUILD_ARM_ETDUMP` when building Executorch and `DEXECUTORCH_ENABLE_EVENT_TRACER` +To enable bundled-io, set `EXECUTORCH_BUILD_DEVTOOLS` when building Executorch and `DET_BUNDLE_IO` when building the executor_runner. To enable ETdump, set `EXECUTORCH_BUILD_ARM_ETDUMP` when building Executorch and `DEXECUTORCH_ENABLE_EVENT_TRACER` when building the executor_runner. +## Memory formats + +Tensors of rank 4 and higher have two differing [memory format](https://pytorch.org/blog/tensor-memory-format-matters/) standards used. +Pytorch defaults to contiguous/ channels first/ NCHW memory formats, compared to TOSA which only supports channels last/NHWC memory format. +To support this, the backend inserts a transpose in the beginning if the incoming memory format is contiguous, and correspondingly a +transpose in the end if the outgoing memory format is contiguous. Note that this means that you may avoid transposing the data unneccessarily if the runtime integration and +full network is converted to use channels last. A word of caution must be given here however - changing memory format has been noted to have side effects such as +unsupported ops being inserted into the graph, and it is currently not widely tested, so the feature must so far be viewed as experimental. 
+ ## See Also -- [Arm Ethos-U Backend Tutorial](tutorial-arm.md) \ No newline at end of file +- [Arm Ethos-U Backend Tutorial](tutorial-arm-ethos-u.md) \ No newline at end of file diff --git a/docs/source/backends-coreml.md b/docs/source/backends-coreml.md index fe6748617a0..3ab0d3d3435 100644 --- a/docs/source/backends-coreml.md +++ b/docs/source/backends-coreml.md @@ -61,7 +61,7 @@ The Core ML partitioner API allows for configuration of the model delegation to - `skip_ops_for_coreml_delegation`: Allows you to skip ops for delegation by Core ML. By default, all ops that Core ML supports will be delegated. See [here](https://github.com/pytorch/executorch/blob/14ff52ff89a89c074fc6c14d3f01683677783dcd/backends/apple/coreml/test/test_coreml_partitioner.py#L42) for an example of skipping an op for delegation. - `compile_specs`: A list of `CompileSpec`s for the Core ML backend. These control low-level details of Core ML delegation, such as the compute unit (CPU, GPU, ANE), the iOS deployment target, and the compute precision (FP16, FP32). These are discussed more below. - `take_over_mutable_buffer`: A boolean that indicates whether PyTorch mutable buffers in stateful models should be converted to [Core ML `MLState`](https://developer.apple.com/documentation/coreml/mlstate). If set to `False`, mutable buffers in the PyTorch graph are converted to graph inputs and outputs to the Core ML lowered module under the hood. Generally, setting `take_over_mutable_buffer` to true will result in better performance, but using `MLState` requires iOS >= 18.0, macOS >= 15.0, and Xcode >= 16.0. -- `take_over_constant_data`: A boolean that indicates whether PyTorch constant data like model weights should be consumed by the Core ML delegate. If set to False, constant data is passed to the Core ML delegate as inputs. By deafault, take_over_constant_data=True. 
+- `take_over_constant_data`: A boolean that indicates whether PyTorch constant data like model weights should be consumed by the Core ML delegate. If set to False, constant data is passed to the Core ML delegate as inputs. By default, take_over_constant_data=True. - `lower_full_graph`: A boolean that indicates whether the entire graph must be lowered to Core ML. If set to True and Core ML does not support an op, an error is raised during lowering. If set to False and Core ML does not support an op, the op is executed on the CPU by ExecuTorch. Although setting `lower_full_graph`=False can allow a model to lower where it would otherwise fail, it can introduce performance overhead in the model when there are unsupported ops. You will see warnings about unsupported ops during lowering if there are any. By default, `lower_full_graph`=False. @@ -187,7 +187,7 @@ To quantize a PyTorch model for the Core ML backend, use the `CoreMLQuantizer`. Quantization with the Core ML backend requires exporting the model for iOS 17 or later. To perform 8-bit quantization with the PT2E flow, follow these steps: -1) Create a [`coremltools.optimize.torch.quantization.LinearQuantizerConfig`](https://apple.github.io/coremltools/source/coremltools.optimize.torch.quantization.html#coremltools.optimize.torch.quantization.LinearQuantizerConfig) and use to to create an instance of a `CoreMLQuantizer`. +1) Create a [`coremltools.optimize.torch.quantization.LinearQuantizerConfig`](https://apple.github.io/coremltools/source/coremltools.optimize.torch.quantization.html#coremltools.optimize.torch.quantization.LinearQuantizerConfig) and use it to create an instance of a `CoreMLQuantizer`. 2) Use `torch.export.export` to export a graph module that will be prepared for quantization. 3) Call `prepare_pt2e` to prepare the model for quantization. 4) Run the prepared model with representative samples to calibrate the quantizated tensor activation ranges. 
@@ -386,4 +386,4 @@ If you're using Python 3.13, try reducing your python version to Python 3.12. c ### At runtime 1. [ETCoreMLModelCompiler.mm:55] [Core ML] Failed to compile model, error = Error Domain=com.apple.mlassetio Code=1 "Failed to parse the model specification. Error: Unable to parse ML Program: at unknown location: Unknown opset 'CoreML7'." UserInfo={NSLocalizedDescription=Failed to par$ -This means the model requires the the Core ML opset 'CoreML7', which requires running the model on iOS >= 17 or macOS >= 14. +This means the model requires the Core ML opset 'CoreML7', which requires running the model on iOS >= 17 or macOS >= 14. diff --git a/docs/source/backends-mediatek.md b/docs/source/backends-mediatek.md index a562cea13bd..34cd56f971b 100644 --- a/docs/source/backends-mediatek.md +++ b/docs/source/backends-mediatek.md @@ -23,7 +23,7 @@ The MediaTek backend enables acceleration of PyTorch models on edge devices with ``` - NeuroPilot SDK Python wheels (download from [NeuroPilot Express SDK](https://neuropilot.mediatek.com/resources/public/npexpress/en/docs/npexpress)): ```bash - pip3 install mtk_neuron-8.2.19-py3-none-linux_x86_64.whl + pip3 install mtk_neuron-8.2.23-py3-none-linux_x86_64.whl pip3 install mtk_converter-8.13.0+public-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl ``` diff --git a/docs/source/backends-nxp.md b/docs/source/backends-nxp.md index f02f495f685..f4f7762c769 100644 --- a/docs/source/backends-nxp.md +++ b/docs/source/backends-nxp.md @@ -1,5 +1,79 @@ # NXP eIQ Neutron Backend -See -[NXP eIQ Neutron Backend](https://github.com/pytorch/executorch/blob/main/backends/nxp/README.md) -for current status about running ExecuTorch on NXP eIQ Neutron Backend. +This manual page is dedicated to introduction of using the ExecuTorch with NXP eIQ Neutron Backend. +NXP offers accelerated machine learning models inference on edge devices. 
+To learn more about NXP's machine learning acceleration platform, please refer to [the official NXP website](https://www.nxp.com/applications/technologies/ai-and-machine-learning:MACHINE-LEARNING). + +
+For up-to-date status about running ExecuTorch on Neutron Backend please visit the manual page. +
+ +## Features + +ExecuTorch v1.0 supports running machine learning models on selected NXP chips (for now only i.MXRT700). +Among currently supported machine learning models are: +- Convolution-based neutral networks +- Full support for MobileNetV2 and CifarNet + +## Prerequisites (Hardware and Software) + +In order to successfully build ExecuTorch project and convert models for NXP eIQ Neutron Backend you will need a computer running Linux. + +If you want to test the runtime, you'll also need: +- Hardware with NXP's [i.MXRT700](https://www.nxp.com/products/i.MX-RT700) chip or a testing board like MIMXRT700-AVK +- [MCUXpresso IDE](https://www.nxp.com/design/design-center/software/development-software/mcuxpresso-software-and-tools-/mcuxpresso-integrated-development-environment-ide:MCUXpresso-IDE) or [MCUXpresso Visual Studio Code extension](https://www.nxp.com/design/design-center/software/development-software/mcuxpresso-software-and-tools-/mcuxpresso-for-visual-studio-code:MCUXPRESSO-VSC) + +## Using NXP backend + +To test converting a neural network model for inference on NXP eIQ Neutron Backend, you can use our example script: + +```shell +# cd to the root of executorch repository +./examples/nxp/aot_neutron_compile.sh [model (cifar10 or mobilenetv2)] +``` + +For a quick overview how to convert a custom PyTorch model, take a look at our [example python script](https://github.com/pytorch/executorch/tree/release/1.0/examples/nxp/aot_neutron_compile.py). + +### Partitioner API + +The partitioner is defined in `NeutronPartitioner` in `backends/nxp/neutron_partitioner.py`. It has the following +arguments: +* `compile_spec` - list of key-value pairs defining compilation. E.g. for specifying platform (i.MXRT700) and Neutron Converter flavor. +* `custom_delegation_options` - custom options for specifying node delegation. + +### Quantization + +The quantization for Neutron Backend is defined in `NeutronQuantizer` in `backends/nxp/quantizer/neutron_quantizer.py`. 
+The quantization follows PT2E workflow, INT8 quantization is supported. Operators are quantized statically, activations +follow affine and weights symmetric per-tensor quantization scheme. + +#### Supported operators + +List of Aten operators supported by Neutron quantizer: + +`abs`, `adaptive_avg_pool2d`, `addmm`, `add.Tensor`, `avg_pool2d`, `cat`, `conv1d`, `conv2d`, `dropout`, +`flatten.using_ints`, `hardtanh`, `hardtanh_`, `linear`, `max_pool2d`, `mean.dim`, `pad`, `permute`, `relu`, `relu_`, +`reshape`, `view`, `softmax.int`, `sigmoid`, `tanh`, `tanh_` + +#### Example +```python +import torch +from executorch.backends.nxp.quantizer.neutron_quantizer import NeutronQuantizer +from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e + +# Prepare your model in Aten dialect +aten_model = get_model_in_aten_dialect() +# Prepare calibration inputs, each tuple is one example, example tuple has items for each model input +calibration_inputs: list[tuple[torch.Tensor, ...]] = get_calibration_inputs() +quantizer = NeutronQuantizer() + +m = prepare_pt2e(aten_model, quantizer) +for data in calibration_inputs: + m(*data) +m = convert_pt2e(m) +``` + +## Runtime Integration + +To learn how to run the converted model on the NXP hardware, use one of our example projects on using ExecuTorch runtime from MCUXpresso IDE example projects list. +For more finegrained tutorial, visit [this manual page](https://mcuxpresso.nxp.com/mcuxsdk/latest/html/middleware/eiq/executorch/docs/nxp/topics/example_applications.html). diff --git a/docs/source/backends-overview.md b/docs/source/backends-overview.md index c83ace26853..4a3313964a8 100644 --- a/docs/source/backends-overview.md +++ b/docs/source/backends-overview.md @@ -1,21 +1,64 @@ -# Backend Overview +# Backends -ExecuTorch backends provide hardware acceleration for a specific hardware target. 
In order to achieve maximum performance on target hardware, ExecuTorch optimizes the model for a specific backend during the export and lowering process. This means that the resulting .pte file is specialized for the specific hardware. In order to deploy to multiple backends, such as Core ML on iOS and Arm CPU on Android, it is common to generate a dedicated .pte file for each. +## Backend Overview -The choice of hardware backend is informed by the hardware that the model is intended to be deployed on. Each backend has specific hardware requires and level of model support. See the documentation for each hardware backend for more details. +ExecuTorch backends provide hardware acceleration for specific hardware targets, enabling models to run efficiently on devices ranging from mobile phones to embedded systems and DSPs. During the export and lowering process, ExecuTorch optimizes your model for the chosen backend, resulting in a `.pte` file specialized for that hardware. To support multiple platforms (e.g., Core ML on iOS, Arm CPU on Android), you typically generate a dedicated `.pte` file for each backend. -As part of the .pte file creation process, ExecuTorch identifies portions of the model (partitions) that are supported for the given backend. These sections are processed by the backend ahead of time to support efficient execution. Portions of the model that are not supported on the delegate, if any, are executed using the portable fallback implementation on CPU. This allows for partial model acceleration when not all model operators are supported on the backend, but may have negative performance implications. In addition, multiple partitioners can be specified in order of priority. This allows for operators not supported on GPU to run on CPU via XNNPACK, for example. +The choice of backend is informed by the hardware your model will run on. Each backend has its own hardware requirements and level of model/operator support. 
See the documentation for each backend for details. -### Available Backends +As part of `.pte` file creation, ExecuTorch identifies model partitions supported by the backend. These are processed ahead of time for efficient execution. Operators not supported by the delegate are executed using the portable CPU fallback (e.g., XNNPACK), allowing for partial acceleration. You can also specify multiple partitioners in order of priority, so unsupported GPU ops can fall back to CPU, for example. -Commonly used hardware backends are listed below. For mobile, consider using XNNPACK for Android and XNNPACK or Core ML for iOS. To create a .pte file for a specific backend, pass the appropriate partitioner class to `to_edge_transform_and_lower`. See the appropriate backend documentation for more information. +--- -- [XNNPACK (Mobile CPU)](backends-xnnpack.md) -- [Core ML (iOS)](backends-coreml.md) -- [Metal Performance Shaders (iOS GPU)](backends-mps.md) -- [Vulkan (Android GPU)](backends-vulkan.md) -- [Qualcomm NPU](backends-qualcomm.md) -- [MediaTek NPU](backends-mediatek.md) -- [ARM Ethos-U NPU](backends-arm-ethos-u.md) -- [ARM VGF](backends-arm-vgf.md) -- [Cadence DSP](backends-cadence.md) +## Why Backends Matter + +Backends are the bridge between your exported model and the hardware it runs on. Choosing the right backend ensures your model takes full advantage of device-specific acceleration, balancing performance, compatibility, and resource usage. 
+ +--- + +## Choosing a Backend + +| Backend | Platform(s) | Hardware Type | Typical Use Case | +|------------------------------------------|---------------------|---------------|---------------------------------| +| [XNNPACK](backends-xnnpack) | All | CPU | General-purpose, fallback | +| [Core ML](backends-coreml) | iOS, macOS | NPU/GPU | Apple devices, high performance | +| [Metal Performance Shaders](backends-mps)| iOS, macOS | GPU | Apple GPU acceleration | +| [Vulkan ](backends-vulkan) | Android | GPU | Android GPU acceleration | +| [Qualcomm](backends-qualcomm) | Android | NPU | Qualcomm SoCs | +| [MediaTek](backends-mediatek) | Android | NPU | MediaTek SoCs | +| [ARM EthosU](backends-arm-ethos-u) | Embedded | NPU | ARM MCUs | +| [ARM VGF](backends-arm-vgf) | Android | NPU | ARM platforms | +| [OpenVINO](build-run-openvino) | Embedded | CPU/GPU/NPU | Intel SoCs | +| [NXP](backends-nxp) | Embedded | NPU | NXP SoCs | +| [Cadence](backends-cadence) | Embedded | DSP | DSP-optimized workloads | +| [Samsung Exynos](backends-samsung-exynos)| Android | NPU | Samsung SoCs | + +**Tip:** For best performance, export a `.pte` file for each backend you plan to support. + +--- + +## Best Practices + +- **Test on all target devices:** Operator support may vary by backend. +- **Use fallback wisely:** If a backend doesn't support an operator, ExecuTorch will run it on CPU. +- **Consult backend docs:** Each backend has unique setup and tuning options. 
+ +--- + +```{toctree} +:maxdepth: 1 +:hidden: +:caption: Backend Overview + +backends-xnnpack +backends-coreml +backends-mps +backends-vulkan +backends-qualcomm +backends-mediatek +backends-arm-ethos-u +backends-arm-vgf +build-run-openvino +backends-nxp +backends-cadence +backends-samsung-exynos diff --git a/docs/source/backends-qualcomm.md b/docs/source/backends-qualcomm.md index 45f932da491..74089885fcf 100644 --- a/docs/source/backends-qualcomm.md +++ b/docs/source/backends-qualcomm.md @@ -74,10 +74,9 @@ This example is verified with SM8550 and SM8450. - A compiler to compile AOT parts, e.g., the GCC compiler comes with Ubuntu LTS. - [Android NDK](https://developer.android.com/ndk). This example is verified with NDK 26c. - [Qualcomm AI Engine Direct SDK](https://developer.qualcomm.com/software/qualcomm-ai-engine-direct-sdk) - - Click the "Get Software" button to download a version of QNN SDK. - - However, at the moment of updating this tutorial, the above website doesn't provide QNN SDK newer than 2.22.6. - - The below is public links to download various QNN versions. Hope they can be publicly discoverable soon. - - [QNN 2.28.0](https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.28.0.241029.zip) + - Click the "Get Software" button to download the latest version of the QNN SDK. + - Although newer versions are available, we have verified and recommend using QNN 2.37.0 for stability. + - You can download it directly from the following link: [QNN 2.37.0](https://softwarecenter.qualcomm.com/api/download/software/sdks/Qualcomm_AI_Runtime_Community/All/2.37.0.250724/v2.37.0.250724.zip) The directory with installed Qualcomm AI Engine Direct SDK looks like: ``` @@ -136,86 +135,6 @@ cd $EXECUTORCH_ROOT ./backends/qualcomm/scripts/build.sh --release ``` -### AOT (Ahead-of-time) components: - -Python APIs on x64 are required to compile models to Qualcomm AI Engine Direct binary. 
- -```bash -cd $EXECUTORCH_ROOT -mkdir build-x86 -cd build-x86 -# Note that the below command might change. -# Please refer to the above build.sh for latest workable commands. -cmake .. \ - -DCMAKE_INSTALL_PREFIX=$PWD \ - -DEXECUTORCH_BUILD_QNN=ON \ - -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \ - -DEXECUTORCH_BUILD_DEVTOOLS=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ - -DPYTHON_EXECUTABLE=python3 - -# nproc is used to detect the number of available CPU. -# If it is not applicable, please feel free to use the number you want. -cmake --build $PWD --target "PyQnnManagerAdaptor" "PyQnnWrapperAdaptor" -j$(nproc) - -# install Python APIs to correct import path -# The filename might vary depending on your Python and host version. -cp -f backends/qualcomm/PyQnnManagerAdaptor.cpython-310-x86_64-linux-gnu.so $EXECUTORCH_ROOT/backends/qualcomm/python -cp -f backends/qualcomm/PyQnnWrapperAdaptor.cpython-310-x86_64-linux-gnu.so $EXECUTORCH_ROOT/backends/qualcomm/python - -# Workaround for .fbs files in exir/_serialize -cp $EXECUTORCH_ROOT/schema/program.fbs $EXECUTORCH_ROOT/exir/_serialize/program.fbs -cp $EXECUTORCH_ROOT/schema/scalar_type.fbs $EXECUTORCH_ROOT/exir/_serialize/scalar_type.fbs -``` - -### Runtime: - -An example `qnn_executor_runner` executable would be used to run the compiled `pte` model. - -Commands to build `qnn_executor_runner` for Android: - -```bash -cd $EXECUTORCH_ROOT -mkdir build-android -cd build-android -# build executorch & qnn_executorch_backend -cmake .. 
\ - -DCMAKE_INSTALL_PREFIX=$PWD \ - -DEXECUTORCH_BUILD_QNN=ON \ - -DQNN_SDK_ROOT=$QNN_SDK_ROOT \ - -DEXECUTORCH_BUILD_DEVTOOLS=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \ - -DPYTHON_EXECUTABLE=python3 \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI='arm64-v8a' \ - -DANDROID_PLATFORM=android-30 - -# nproc is used to detect the number of available CPU. -# If it is not applicable, please feel free to use the number you want. -cmake --build $PWD --target install -j$(nproc) - -cmake ../examples/qualcomm \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_ROOT/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI='arm64-v8a' \ - -DANDROID_PLATFORM=android-30 \ - -DCMAKE_PREFIX_PATH="$PWD/lib/cmake/ExecuTorch;$PWD/third-party/gflags;" \ - -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \ - -DPYTHON_EXECUTABLE=python3 \ - -Bexamples/qualcomm - -cmake --build examples/qualcomm -j$(nproc) - -# qnn_executor_runner can be found under examples/qualcomm -# The full path is $EXECUTORCH_ROOT/build-android/examples/qualcomm/executor_runner/qnn_executor_runner -ls examples/qualcomm -``` - -**Note:** If you want to build for release, add `-DCMAKE_BUILD_TYPE=Release` to the `cmake` command options. 
- ## Deploying and running on device @@ -315,9 +234,11 @@ adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnSystem.so ${DEVICE_DIR} adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV69Stub.so ${DEVICE_DIR} adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV73Stub.so ${DEVICE_DIR} adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV75Stub.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV79Stub.so ${DEVICE_DIR} adb push ${QNN_SDK_ROOT}/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so ${DEVICE_DIR} adb push ${QNN_SDK_ROOT}/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so ${DEVICE_DIR} adb push ${QNN_SDK_ROOT}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${DEVICE_DIR} +adb push ${QNN_SDK_ROOT}/lib/hexagon-v79/unsigned/libQnnHtpV79Skel.so ${DEVICE_DIR} ``` ***Step 2***. We also need to indicate dynamic linkers on Android and Hexagon @@ -363,13 +284,13 @@ The model, inputs, and output location are passed to `qnn_executorch_runner` by ## Supported model list -Please refer to `$EXECUTORCH_ROOT/examples/qualcomm/scripts/` and `EXECUTORCH_ROOT/examples/qualcomm/oss_scripts/` to the list of supported models. +Please refer to `$EXECUTORCH_ROOT/examples/qualcomm/scripts/` and `$EXECUTORCH_ROOT/examples/qualcomm/oss_scripts/` to the list of supported models. 
## How to Support a Custom Model in HTP Backend ### Step-by-Step Implementation Guide -Please reference [the simple example](https://github.com/pytorch/executorch/blob/main/examples/qualcomm/scripts/export_example.py) and [more compilated examples](https://github.com/pytorch/executorch/tree/main/examples/qualcomm/scripts) for reference +Please reference [the simple example](https://github.com/pytorch/executorch/blob/main/examples/qualcomm/scripts/export_example.py) and [more complicated examples](https://github.com/pytorch/executorch/tree/main/examples/qualcomm/scripts) for reference #### Step 1: Prepare Your Model ```python import torch @@ -476,4 +397,4 @@ print(f"Model successfully exported to {model_name}") ## FAQ If you encounter any issues while reproducing the tutorial, please file a github -issue on ExecuTorch repo and tag use `#qcom_aisw` tag +[issue](https://github.com/pytorch/executorch/issues) on ExecuTorch repo and tag use `#qcom_aisw` tag diff --git a/docs/source/backends-samsung-exynos.md b/docs/source/backends-samsung-exynos.md new file mode 100644 index 00000000000..0d77936bf7f --- /dev/null +++ b/docs/source/backends-samsung-exynos.md @@ -0,0 +1 @@ +# Samsung Exynos Backend (TBD) diff --git a/docs/source/backends-section.md b/docs/source/backends-section.md new file mode 100644 index 00000000000..29a235a9416 --- /dev/null +++ b/docs/source/backends-section.md @@ -0,0 +1 @@ +```{include} backends-overview.md diff --git a/docs/source/backends-vulkan.md b/docs/source/backends-vulkan.md index 3ae80950645..531deece4e2 100644 --- a/docs/source/backends-vulkan.md +++ b/docs/source/backends-vulkan.md @@ -150,7 +150,7 @@ when building with CMake. First, make sure that you have the Android NDK installed; any NDK version past NDK r19c should work. Note that the examples in this doc have been validated with -NDK r27b. The Android SDK should also be installed so that you have access to `adb`. +NDK r28c. 
The Android SDK should also be installed so that you have access to `adb`. The instructions in this page assumes that the following environment variables are set. diff --git a/docs/source/backends-xnnpack.md b/docs/source/backends-xnnpack.md index d1a120e69fa..42e76741ec8 100644 --- a/docs/source/backends-xnnpack.md +++ b/docs/source/backends-xnnpack.md @@ -67,10 +67,11 @@ The XNNPACK delegate can also be used as a backend to execute symmetrically quan ### Supported Quantization Schemes The XNNPACK delegate supports the following quantization schemes: + - 8-bit symmetric weights with 8-bit asymmetric activations (via the PT2E quantization flow). - - Supports both static and dynamic activations. - - Supports per-channel and per-tensor schemes. - - Supports linear, convolution, add, mul, cat, and adaptive avg pool 2d operators. + - Supports both static and dynamic activations. + - Supports per-channel and per-tensor schemes. + - Supports linear, convolution, add, mul, cat, and adaptive avg pool 2d operators. Weight-only quantization is not currently supported on XNNPACK. @@ -81,7 +82,7 @@ To perform 8-bit quantization with the PT2E flow, perform the following steps pr 1) Create an instance of the `XnnpackQuantizer` class. Set quantization parameters. 2) Use `torch.export.export` to prepare for quantization. 3) Call `prepare_pt2e` to prepare the model for quantization. -4) For static quantization, run the prepared model with representative samples to calibrate the quantizated tensor activation ranges. +4) For static quantization, run the prepared model with representative samples to calibrate the quantized tensor activation ranges. 5) Call `convert_pt2e` to quantize the model. 6) Export and lower the model using the standard flow. 
diff --git a/docs/source/build-run-openvino.md b/docs/source/build-run-openvino.md index dc6f098850f..9b4c48fee5a 100644 --- a/docs/source/build-run-openvino.md +++ b/docs/source/build-run-openvino.md @@ -61,7 +61,7 @@ For more information about OpenVINO build, refer to the [OpenVINO Build Instruct Follow the steps below to setup your build environment: -1. **Setup ExecuTorch Environment**: Refer to the [Environment Setup](getting-started-setup.md#environment-setup) guide for detailed instructions on setting up the ExecuTorch environment. +1. **Setup ExecuTorch Environment**: Refer to the [Environment Setup](using-executorch-building-from-source.md#environment-setup) guide for detailed instructions on setting up the ExecuTorch environment. 2. **Setup OpenVINO Backend Environment** - Install the dependent libs. Ensure that you are inside `executorch/backends/openvino/` directory @@ -92,7 +92,7 @@ The exported model will be saved as 'resnet50.pte' in the current directory. ### Build C++ OpenVINO Examples -After building the OpenVINO backend following the [instructions](#setup) above, the executable will be saved in `/cmake-out/backends/openvino/`. +After building the OpenVINO backend following the [instructions](#setup) above, the executable will be saved in `/cmake-out/`. The executable requires a model file (`.pte` file generated in the aot step) and the number of inference executions. @@ -101,7 +101,7 @@ The executable requires a model file (`.pte` file generated in the aot step) and Run inference with a given model for 10 executions: ``` -./openvino_executor_runner \ +./executor_runner \ --model_path=model.pte \ --num_executions=10 ``` diff --git a/docs/source/bundled-io.md b/docs/source/bundled-io.md index 79897737268..c0b03938374 100644 --- a/docs/source/bundled-io.md +++ b/docs/source/bundled-io.md @@ -17,7 +17,7 @@ This stage mainly focuses on the creation of a `BundledProgram` and dumping it o ### Step 1: Create a Model and Emit its ExecuTorch Program. 
-ExecuTorch Program can be emitted from user's model by using ExecuTorch APIs. Follow the [Generate and emit sample ExecuTorch program](getting-started.md#exporting) or [Exporting to ExecuTorch tutorial](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial). +ExecuTorch Program can be emitted from user's model by using ExecuTorch APIs. Follow the [Generate and emit sample ExecuTorch program](getting-started.md#exporting) or [Exporting to ExecuTorch tutorial](tutorials/export-to-executorch-tutorial) . ### Step 2: Construct `List[MethodTestSuite]` to hold test info @@ -194,7 +194,7 @@ regenerate_bundled_program = deserialize_from_flatbuffer_to_bundled_program(seri ``` ## Runtime Stage -This stage mainly focuses on executing the model with the bundled inputs and and comparing the model's output with the bundled expected output. We provide multiple APIs to handle the key parts of it. +This stage mainly focuses on executing the model with the bundled inputs and comparing the model's output with the bundled expected output. We provide multiple APIs to handle the key parts of it. ### Get ExecuTorch Program Pointer from `BundledProgram` Buffer diff --git a/docs/source/compiler-delegate-and-partitioner.md b/docs/source/compiler-delegate-and-partitioner.md index c633bb1fd12..b057f3afa2e 100644 --- a/docs/source/compiler-delegate-and-partitioner.md +++ b/docs/source/compiler-delegate-and-partitioner.md @@ -1,4 +1,4 @@ -# Backends and Delegates +# Understanding Backends and Delegates Audience: Vendors, Backend Delegate developers, who are interested in integrating their own compilers and hardware as part of ExecuTorch @@ -37,7 +37,7 @@ The diagram looks like following There are mainly two Ahead-of-Time entry point for backend to implement: `partition` and `preprocess`. `partitioner` is an algorithm implemented by the backend to tag the nodes to be lowered to the backend. 
`to_backend` API will apply the partition algorithm and lower each subgraph, which consists of connected tagged nodes, to the targeted backend. Every subgraph -will be sent to the `preprocess` part provided by the backend to compiled as a binary blob. +will be sent to the `preprocess` part provided by the backend to be compiled as a binary blob. During partition, the `exported_program` is not allowed to mutate the program, and it's supposed to apply tag to each node. The `PartitionResult` includes both tagged exported program and the partition tags dictionary for `to_backend` to look up the tag and @@ -194,8 +194,8 @@ qnnpack is one backend and xnnpack is another backend. We haven't open-sourced these two backends delegates yet, and this example won't run out of box. It can be used as a reference to see how it can be done. -This option is easy to try becuase usually all backends will implement their own -parititioner. However this option may get different results if we change the +This option is easy to try because usually all backends will implement their own +partitioner. However this option may get different results if we change the order of to_backend call. If we want to have a better control on the nodes, like which backend they should go, option 2 is better. diff --git a/docs/source/compiler-entry-points.md b/docs/source/compiler-entry-points.md new file mode 100644 index 00000000000..ac5623c6769 --- /dev/null +++ b/docs/source/compiler-entry-points.md @@ -0,0 +1,9 @@ +# Compiler Entry Points + +```{toctree} +:maxdepth: 1 + +compiler-backend-dialect +compiler-custom-compiler-passes +compiler-memory-planning +``` diff --git a/docs/source/compiler-ir-advanced.md b/docs/source/compiler-ir-advanced.md new file mode 100644 index 00000000000..b6d24026d5a --- /dev/null +++ b/docs/source/compiler-ir-advanced.md @@ -0,0 +1,31 @@ +(compiler-ir-advanced)= +# Compiler & IR + +Advanced compiler features and intermediate representation specifications. 
+ +## Compiler Passes + +- {doc}`compiler-custom-compiler-passes` — Custom compiler passes and optimization + +## Memory Management + +- {doc}`compiler-memory-planning` — Advanced memory planning strategies + +## Intermediate Representation + +- {doc}`ir-exir` — EXIR (Export Intermediate Representation) specification +- {doc}`ir-ops-set-definition` — Ops set definition and operator standardization + +## Backend dialect + +- {doc}`compiler-backend-dialect` — Backend dialect and compiler integration + +```{toctree} +:hidden: +:maxdepth: 1 + +compiler-custom-compiler-passes +compiler-memory-planning +ir-exir +ir-ops-set-definition +compiler-backend-dialect diff --git a/docs/source/conf.py b/docs/source/conf.py index 65845c03868..31abdef2820 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -24,7 +24,7 @@ import sys from typing import Any -import pytorch_sphinx_theme +import pytorch_sphinx_theme2 # type: ignore[import-not-found] # To let us import ./custom_directives.py sys.path.insert(0, os.path.abspath(".")) @@ -63,13 +63,10 @@ "sphinx_design", "sphinx_gallery.gen_gallery", "sphinx_reredirects", + "sphinx_sitemap", + "sphinxcontrib.mermaid", ] -if not FBCODE: - extensions += [ - "executorch_custom_versions", - ] - this_file_dir = os.path.abspath(os.path.dirname(__file__)) doxygen_xml_dir = os.path.join( os.path.dirname(this_file_dir), # {repo_root}/docs/ @@ -77,7 +74,7 @@ "xml", # {repo_root}/docs/cpp/build/xml ) -html_favicon = "_static/img/ExecuTorch-Logo-cropped.svg" +html_favicon = "_static/img/executorch-chip-logo.svg" # Get ET_VERSION_DOCS during the build. 
et_version_docs = os.environ.get("ET_VERSION_DOCS", None) @@ -99,14 +96,23 @@ print(f"Version: {version}") html_title = " ".join((project, version, "documentation")) +html_baseurl = "https://docs.pytorch.org/executorch/" # needed for sphinx-sitemap +sitemap_locales = [None] +sitemap_excludes = [ + "search.html", + "genindex.html", +] +sitemap_url_scheme = "{link}" + breathe_projects = {"ExecuTorch": "../build/xml/"} breathe_default_project = "ExecuTorch" -templates_path = ["_templates"] autodoc_typehints = "description" myst_enable_extensions = [ "colon_fence", + "deflist", + "html_image", ] myst_heading_anchors = 4 @@ -162,23 +168,78 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = "pytorch_sphinx_theme" -html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()] +html_theme = "pytorch_sphinx_theme2" +html_theme_path = [pytorch_sphinx_theme2.get_html_theme_path()] # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. # + +switcher_version = version + html_theme_options = { + "logo": { + "image_light": "_static/img/et-logo.png", + "image_dark": "_static/img/et-logo.png", + }, + "navigation_with_keys": False, + "canonical_url": "https://docs.pytorch.org/executorch/stable/", + "switcher": { + "json_url": "https://docs.pytorch.org/executorch/executorch-versions.json", # for testing only, will need to replace to the correct json file on the executorch website when it's added in the repo. 
+ "version_match": switcher_version, + }, + "show_toc_level": 2, + "analytics_id": "GTM-T8XT4PS", + "icon_links": [ + { + "name": "X", + "url": "https://x.com/PyTorch", + "icon": "fa-brands fa-x-twitter", + }, + { + "name": "GitHub", + "url": "https://github.com/pytorch/executorch", + "icon": "fa-brands fa-github", + }, + { + "name": "Discourse", + "url": "https://discuss.pytorch.org/", + "icon": "fa-brands fa-discourse", + }, + { + "name": "PyPi", + "url": "https://pypi.org/project/executorch", + "icon": "fa-brands fa-python", + }, + ], + "show_version_warning_banner": True, + "use_edit_page_button": True, + "header_links_before_dropdown": 8, + "navbar_align": "left", + "navbar_start": ["navbar-logo", "version-switcher"], + "navbar_center": ["navbar-nav"], + "navbar_end": ["search-field-custom", "theme-switcher", "navbar-icon-links"], + "navbar_persistent": [], +} + +theme_variables = pytorch_sphinx_theme2.get_theme_variables() +templates_path = [ + "_templates", + os.path.join(os.path.dirname(pytorch_sphinx_theme2.__file__), "templates"), +] + +html_context = { + "theme_variables": theme_variables, + "display_github": True, + "github_url": "https://github.com", + "github_user": "pytorch", + "github_repo": "executorch", + "feedback_url": "https://github.com/pytorch/executorch", + "github_version": "main", + "doc_path": "docs/source", "pytorch_project": "executorch", "display_version": True, - "logo_only": True, - "collapse_navigation": True, # changed to True to enable 3rd level nav. - "sticky_navigation": False, - "navigation_depth": 4, - "includehidden": True, - "titles_only": False, - "analytics_id": "GTM-T8XT4PS", } # Add any paths that contain custom static files (such as style sheets) here, @@ -186,14 +247,15 @@ # so a file named "default.css" will overwrite the builtin "default.css". 
html_static_path = ["_static"] -html_css_files = ["css/custom.css", "progress-bar.css"] -html_js_files = ["js/progress-bar.js"] +# Add custom 404 page for GitHub Pages +html_additional_pages = {"404": "404.html"} + # Example configuration for intersphinx: refer to the Python standard library. intersphinx_mapping = { "python": ("https://docs.python.org/", None), "numpy": ("https://numpy.org/doc/stable/", None), - "torch": ("https://pytorch.org/docs/stable/", None), + "torch": ("https://docs.pytorch.org/docs/stable/", None), } # Redirects for moved pages diff --git a/docs/source/debug-backend-delegate.md b/docs/source/debug-backend-delegate.md index 86dddd75868..efb4653a994 100644 --- a/docs/source/debug-backend-delegate.md +++ b/docs/source/debug-backend-delegate.md @@ -6,60 +6,607 @@ We provide a list of util functions to give users insights on what happened to t The `get_delegation_info()` method provides a summary of what happened to the model after the `to_backend()` call: ```python +import torch +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner +from executorch.exir import to_edge_transform_and_lower +from torch.export import Dim, export +from torchvision.models.mobilenetv2 import MobileNet_V2_Weights +import torchvision.models as models + +# Dependency needed for debugging delegates from executorch.devtools.backend_debug import get_delegation_info from tabulate import tabulate -# ... 
After call to to_backend(), but before to_executorch() -graph_module = edge_manager.exported_program().graph_module + +model = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).eval() +sample_inputs = (torch.randn(1, 3, 224, 224), ) + +et_program = to_edge_transform_and_lower( + torch.export.export(model, sample_inputs), + partitioner=[XnnpackPartitioner()] +) +graph_module = et_program.exported_program().graph_module delegation_info = get_delegation_info(graph_module) +# print the summary like the number of delegated nodes, non-delegated nodes, etc print(delegation_info.get_summary()) df = delegation_info.get_operator_delegation_dataframe() +# print the table including op_type, occurrences_in_delegated_graphs, occurrences_in_non_delegated_graphs print(tabulate(df, headers="keys", tablefmt="fancy_grid")) ``` Example printout: ``` -Total delegated subgraphs: 86 -Number of delegated nodes: 473 -Number of non-delegated nodes: 430 +Total delegated subgraphs: 2 +Number of delegated nodes: 203 +Number of non-delegated nodes: 4 ``` +| | op_type | occurrences_in_delegated_graphs | occurrences_in_non_delegated_graphs | +|----|---------------------------------------------------|---------------------------------|-------------------------------------| +| 0 | aten__native_batch_norm_legit_no_training_default | 52 | 0 | +| 1 | aten_add_tensor | 10 | 0 | +| 2 | aten_convolution_default | 52 | 0 | +| 3 | aten_hardtanh_default | 35 | 0 | +| 4 | aten_linear_default | 1 | 0 | +| 5 | aten_mean_dim | 1 | 0 | +| 6 | aten_view_copy_default | 0 | 1 | +| 7 | dim_order_ops__clone_dim_order_default | 0 | 1 | +| 8 | getitem | 52 | 2 | +| 9 | **Total** | **203** | **4** | -| | op_type | occurrences_in_delegated_graphs | occurrences_in_non_delegated_graphs | -|----|---------------------------------|------- |-----| -| 0 | aten__softmax_default | 12 | 0 | -| 1 | aten_add_tensor | 37 | 0 | -| 2 | aten_addmm_default | 48 | 0 | -| 3 | aten_arange_start_step | 0 | 25 | -| | ... 
| | | -| 23 | aten_view_copy_default | 170 | 48 | -| | ... | | | -| 26 | Total | 473 | 430 | -From the table, the operator `aten_view_copy_default` appears 170 times in delegate graphs and 48 times in non-delegated graphs. Users can use information like this to debug. +From the table, the operator `aten_view_copy_default` appears 0 times in delegate graphs and 1 times in non-delegated graphs. Users can use information like this to debug. `get_item node` is a special case, it means getting the output from the delegate subgraph. ## Visualize delegated graph -To see a more detailed view, use the `format_delegated_graph()` method to get a str of printout of the whole graph or use `print_delegated_graph()` to print directly: +To see a more detailed view, use the `format_delegated_graph()` method to get a string representation of the entire graph or use `print_delegated_graph()` to print directly: ```python from executorch.exir.backend.utils import format_delegated_graph -graph_module = edge_manager.exported_program().graph_module +graph_module = et_program.exported_program().graph_module print(format_delegated_graph(graph_module)) # or call print_delegated_graph(graph_module) ``` -It will print the whole model as well as the subgraph consumed by the backend. The generic debug function provided by fx like `print_tabular()` or `print_readable()` will only show `call_delegate` but hide the the subgraph consumes by the backend, while this function exposes the contents inside the subgraph. +It will print the whole model as well as the subgraph consumed by the backend. The generic debug function provided by fx like `print_tabular()` or `print_readable()` will only show `call_delegate` and hide the subgraph consumed by the backend, while this function exposes the contents inside the subgraph. -In the example printout below, observe that `embedding` and `add` operators are delegated to `XNNPACK` while the `sub` operator is not. 
+In the example printout below, observe that there are two subgraphs; `aten_view_copy_default` is not delegated, while most of the other ops are delegated. +
``` -%aten_unsqueeze_copy_default_22 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.unsqueeze_copy.default](args = (%aten_arange_start_step_23, -2), kwargs = {}) - %aten_unsqueeze_copy_default_23 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.unsqueeze_copy.default](args = (%aten_arange_start_step_24, -1), kwargs = {}) +graph(): + %b_features_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_0_1_num_batches_tracked] + %b_features_1_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_1_conv_0_1_num_batches_tracked] + %b_features_1_conv_2_num_batches_tracked : [num_users=0] = placeholder[target=b_features_1_conv_2_num_batches_tracked] + %b_features_2_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_2_conv_0_1_num_batches_tracked] + %b_features_2_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_2_conv_1_1_num_batches_tracked] + %b_features_2_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_2_conv_3_num_batches_tracked] + %b_features_3_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_3_conv_0_1_num_batches_tracked] + %b_features_3_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_3_conv_1_1_num_batches_tracked] + %b_features_3_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_3_conv_3_num_batches_tracked] + %b_features_4_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_4_conv_0_1_num_batches_tracked] + %b_features_4_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_4_conv_1_1_num_batches_tracked] + %b_features_4_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_4_conv_3_num_batches_tracked] + %b_features_5_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_5_conv_0_1_num_batches_tracked] + 
%b_features_5_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_5_conv_1_1_num_batches_tracked] + %b_features_5_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_5_conv_3_num_batches_tracked] + %b_features_6_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_6_conv_0_1_num_batches_tracked] + %b_features_6_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_6_conv_1_1_num_batches_tracked] + %b_features_6_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_6_conv_3_num_batches_tracked] + %b_features_7_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_7_conv_0_1_num_batches_tracked] + %b_features_7_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_7_conv_1_1_num_batches_tracked] + %b_features_7_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_7_conv_3_num_batches_tracked] + %b_features_8_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_8_conv_0_1_num_batches_tracked] + %b_features_8_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_8_conv_1_1_num_batches_tracked] + %b_features_8_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_8_conv_3_num_batches_tracked] + %b_features_9_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_9_conv_0_1_num_batches_tracked] + %b_features_9_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_9_conv_1_1_num_batches_tracked] + %b_features_9_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_9_conv_3_num_batches_tracked] + %b_features_10_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_10_conv_0_1_num_batches_tracked] + %b_features_10_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_10_conv_1_1_num_batches_tracked] + 
%b_features_10_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_10_conv_3_num_batches_tracked] + %b_features_11_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_11_conv_0_1_num_batches_tracked] + %b_features_11_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_11_conv_1_1_num_batches_tracked] + %b_features_11_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_11_conv_3_num_batches_tracked] + %b_features_12_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_12_conv_0_1_num_batches_tracked] + %b_features_12_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_12_conv_1_1_num_batches_tracked] + %b_features_12_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_12_conv_3_num_batches_tracked] + %b_features_13_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_13_conv_0_1_num_batches_tracked] + %b_features_13_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_13_conv_1_1_num_batches_tracked] + %b_features_13_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_13_conv_3_num_batches_tracked] + %b_features_14_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_14_conv_0_1_num_batches_tracked] + %b_features_14_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_14_conv_1_1_num_batches_tracked] + %b_features_14_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_14_conv_3_num_batches_tracked] + %b_features_15_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_15_conv_0_1_num_batches_tracked] + %b_features_15_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_15_conv_1_1_num_batches_tracked] + %b_features_15_conv_3_num_batches_tracked : [num_users=0] = 
placeholder[target=b_features_15_conv_3_num_batches_tracked] + %b_features_16_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_16_conv_0_1_num_batches_tracked] + %b_features_16_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_16_conv_1_1_num_batches_tracked] + %b_features_16_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_16_conv_3_num_batches_tracked] + %b_features_17_conv_0_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_17_conv_0_1_num_batches_tracked] + %b_features_17_conv_1_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_17_conv_1_1_num_batches_tracked] + %b_features_17_conv_3_num_batches_tracked : [num_users=0] = placeholder[target=b_features_17_conv_3_num_batches_tracked] + %b_features_18_1_num_batches_tracked : [num_users=0] = placeholder[target=b_features_18_1_num_batches_tracked] + %x : [num_users=1] = placeholder[target=x] %lowered_module_0 : [num_users=1] = get_attr[target=lowered_module_0] backend_id: XnnpackBackend lowered graph(): - %aten_embedding_default : [num_users=1] = placeholder[target=aten_embedding_default] - %aten_embedding_default_1 : [num_users=1] = placeholder[target=aten_embedding_default_1] - %aten_add_tensor : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.add.Tensor](args = (%aten_embedding_default, %aten_embedding_default_1), kwargs = {}) - return (aten_add_tensor,) - %executorch_call_delegate : [num_users=1] = call_function[target=torch.ops.higher_order.executorch_call_delegate](args = (%lowered_module_0, %aten_embedding_default, %aten_embedding_default_1), kwargs = {}) - %aten_sub_tensor : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.sub.Tensor](args = (%aten_unsqueeze_copy_default, %aten_unsqueeze_copy_default_1), kwargs = {}) + %p_features_0_0_weight : [num_users=1] = placeholder[target=p_features_0_0_weight] + %p_features_0_1_weight : 
[num_users=1] = placeholder[target=p_features_0_1_weight] + %p_features_0_1_bias : [num_users=1] = placeholder[target=p_features_0_1_bias] + %p_features_1_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_1_conv_0_0_weight] + %p_features_1_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_1_conv_0_1_weight] + %p_features_1_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_1_conv_0_1_bias] + %p_features_1_conv_1_weight : [num_users=1] = placeholder[target=p_features_1_conv_1_weight] + %p_features_1_conv_2_weight : [num_users=1] = placeholder[target=p_features_1_conv_2_weight] + %p_features_1_conv_2_bias : [num_users=1] = placeholder[target=p_features_1_conv_2_bias] + %p_features_2_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_2_conv_0_0_weight] + %p_features_2_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_2_conv_0_1_weight] + %p_features_2_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_2_conv_0_1_bias] + %p_features_2_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_2_conv_1_0_weight] + %p_features_2_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_2_conv_1_1_weight] + %p_features_2_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_2_conv_1_1_bias] + %p_features_2_conv_2_weight : [num_users=1] = placeholder[target=p_features_2_conv_2_weight] + %p_features_2_conv_3_weight : [num_users=1] = placeholder[target=p_features_2_conv_3_weight] + %p_features_2_conv_3_bias : [num_users=1] = placeholder[target=p_features_2_conv_3_bias] + %p_features_3_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_3_conv_0_0_weight] + %p_features_3_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_3_conv_0_1_weight] + %p_features_3_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_3_conv_0_1_bias] + %p_features_3_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_3_conv_1_0_weight] + 
%p_features_3_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_3_conv_1_1_weight] + %p_features_3_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_3_conv_1_1_bias] + %p_features_3_conv_2_weight : [num_users=1] = placeholder[target=p_features_3_conv_2_weight] + %p_features_3_conv_3_weight : [num_users=1] = placeholder[target=p_features_3_conv_3_weight] + %p_features_3_conv_3_bias : [num_users=1] = placeholder[target=p_features_3_conv_3_bias] + %p_features_4_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_4_conv_0_0_weight] + %p_features_4_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_4_conv_0_1_weight] + %p_features_4_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_4_conv_0_1_bias] + %p_features_4_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_4_conv_1_0_weight] + %p_features_4_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_4_conv_1_1_weight] + %p_features_4_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_4_conv_1_1_bias] + %p_features_4_conv_2_weight : [num_users=1] = placeholder[target=p_features_4_conv_2_weight] + %p_features_4_conv_3_weight : [num_users=1] = placeholder[target=p_features_4_conv_3_weight] + %p_features_4_conv_3_bias : [num_users=1] = placeholder[target=p_features_4_conv_3_bias] + %p_features_5_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_5_conv_0_0_weight] + %p_features_5_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_5_conv_0_1_weight] + %p_features_5_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_5_conv_0_1_bias] + %p_features_5_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_5_conv_1_0_weight] + %p_features_5_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_5_conv_1_1_weight] + %p_features_5_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_5_conv_1_1_bias] + %p_features_5_conv_2_weight : [num_users=1] = 
placeholder[target=p_features_5_conv_2_weight] + %p_features_5_conv_3_weight : [num_users=1] = placeholder[target=p_features_5_conv_3_weight] + %p_features_5_conv_3_bias : [num_users=1] = placeholder[target=p_features_5_conv_3_bias] + %p_features_6_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_6_conv_0_0_weight] + %p_features_6_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_6_conv_0_1_weight] + %p_features_6_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_6_conv_0_1_bias] + %p_features_6_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_6_conv_1_0_weight] + %p_features_6_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_6_conv_1_1_weight] + %p_features_6_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_6_conv_1_1_bias] + %p_features_6_conv_2_weight : [num_users=1] = placeholder[target=p_features_6_conv_2_weight] + %p_features_6_conv_3_weight : [num_users=1] = placeholder[target=p_features_6_conv_3_weight] + %p_features_6_conv_3_bias : [num_users=1] = placeholder[target=p_features_6_conv_3_bias] + %p_features_7_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_7_conv_0_0_weight] + %p_features_7_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_7_conv_0_1_weight] + %p_features_7_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_7_conv_0_1_bias] + %p_features_7_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_7_conv_1_0_weight] + %p_features_7_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_7_conv_1_1_weight] + %p_features_7_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_7_conv_1_1_bias] + %p_features_7_conv_2_weight : [num_users=1] = placeholder[target=p_features_7_conv_2_weight] + %p_features_7_conv_3_weight : [num_users=1] = placeholder[target=p_features_7_conv_3_weight] + %p_features_7_conv_3_bias : [num_users=1] = placeholder[target=p_features_7_conv_3_bias] + %p_features_8_conv_0_0_weight 
: [num_users=1] = placeholder[target=p_features_8_conv_0_0_weight] + %p_features_8_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_8_conv_0_1_weight] + %p_features_8_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_8_conv_0_1_bias] + %p_features_8_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_8_conv_1_0_weight] + %p_features_8_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_8_conv_1_1_weight] + %p_features_8_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_8_conv_1_1_bias] + %p_features_8_conv_2_weight : [num_users=1] = placeholder[target=p_features_8_conv_2_weight] + %p_features_8_conv_3_weight : [num_users=1] = placeholder[target=p_features_8_conv_3_weight] + %p_features_8_conv_3_bias : [num_users=1] = placeholder[target=p_features_8_conv_3_bias] + %p_features_9_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_9_conv_0_0_weight] + %p_features_9_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_9_conv_0_1_weight] + %p_features_9_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_9_conv_0_1_bias] + %p_features_9_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_9_conv_1_0_weight] + %p_features_9_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_9_conv_1_1_weight] + %p_features_9_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_9_conv_1_1_bias] + %p_features_9_conv_2_weight : [num_users=1] = placeholder[target=p_features_9_conv_2_weight] + %p_features_9_conv_3_weight : [num_users=1] = placeholder[target=p_features_9_conv_3_weight] + %p_features_9_conv_3_bias : [num_users=1] = placeholder[target=p_features_9_conv_3_bias] + %p_features_10_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_10_conv_0_0_weight] + %p_features_10_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_10_conv_0_1_weight] + %p_features_10_conv_0_1_bias : [num_users=1] = 
placeholder[target=p_features_10_conv_0_1_bias] + %p_features_10_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_10_conv_1_0_weight] + %p_features_10_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_10_conv_1_1_weight] + %p_features_10_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_10_conv_1_1_bias] + %p_features_10_conv_2_weight : [num_users=1] = placeholder[target=p_features_10_conv_2_weight] + %p_features_10_conv_3_weight : [num_users=1] = placeholder[target=p_features_10_conv_3_weight] + %p_features_10_conv_3_bias : [num_users=1] = placeholder[target=p_features_10_conv_3_bias] + %p_features_11_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_11_conv_0_0_weight] + %p_features_11_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_11_conv_0_1_weight] + %p_features_11_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_11_conv_0_1_bias] + %p_features_11_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_11_conv_1_0_weight] + %p_features_11_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_11_conv_1_1_weight] + %p_features_11_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_11_conv_1_1_bias] + %p_features_11_conv_2_weight : [num_users=1] = placeholder[target=p_features_11_conv_2_weight] + %p_features_11_conv_3_weight : [num_users=1] = placeholder[target=p_features_11_conv_3_weight] + %p_features_11_conv_3_bias : [num_users=1] = placeholder[target=p_features_11_conv_3_bias] + %p_features_12_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_12_conv_0_0_weight] + %p_features_12_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_12_conv_0_1_weight] + %p_features_12_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_12_conv_0_1_bias] + %p_features_12_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_12_conv_1_0_weight] + %p_features_12_conv_1_1_weight : [num_users=1] = 
placeholder[target=p_features_12_conv_1_1_weight] + %p_features_12_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_12_conv_1_1_bias] + %p_features_12_conv_2_weight : [num_users=1] = placeholder[target=p_features_12_conv_2_weight] + %p_features_12_conv_3_weight : [num_users=1] = placeholder[target=p_features_12_conv_3_weight] + %p_features_12_conv_3_bias : [num_users=1] = placeholder[target=p_features_12_conv_3_bias] + %p_features_13_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_13_conv_0_0_weight] + %p_features_13_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_13_conv_0_1_weight] + %p_features_13_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_13_conv_0_1_bias] + %p_features_13_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_13_conv_1_0_weight] + %p_features_13_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_13_conv_1_1_weight] + %p_features_13_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_13_conv_1_1_bias] + %p_features_13_conv_2_weight : [num_users=1] = placeholder[target=p_features_13_conv_2_weight] + %p_features_13_conv_3_weight : [num_users=1] = placeholder[target=p_features_13_conv_3_weight] + %p_features_13_conv_3_bias : [num_users=1] = placeholder[target=p_features_13_conv_3_bias] + %p_features_14_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_14_conv_0_0_weight] + %p_features_14_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_14_conv_0_1_weight] + %p_features_14_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_14_conv_0_1_bias] + %p_features_14_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_14_conv_1_0_weight] + %p_features_14_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_14_conv_1_1_weight] + %p_features_14_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_14_conv_1_1_bias] + %p_features_14_conv_2_weight : [num_users=1] = 
placeholder[target=p_features_14_conv_2_weight] + %p_features_14_conv_3_weight : [num_users=1] = placeholder[target=p_features_14_conv_3_weight] + %p_features_14_conv_3_bias : [num_users=1] = placeholder[target=p_features_14_conv_3_bias] + %p_features_15_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_15_conv_0_0_weight] + %p_features_15_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_15_conv_0_1_weight] + %p_features_15_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_15_conv_0_1_bias] + %p_features_15_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_15_conv_1_0_weight] + %p_features_15_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_15_conv_1_1_weight] + %p_features_15_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_15_conv_1_1_bias] + %p_features_15_conv_2_weight : [num_users=1] = placeholder[target=p_features_15_conv_2_weight] + %p_features_15_conv_3_weight : [num_users=1] = placeholder[target=p_features_15_conv_3_weight] + %p_features_15_conv_3_bias : [num_users=1] = placeholder[target=p_features_15_conv_3_bias] + %p_features_16_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_16_conv_0_0_weight] + %p_features_16_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_16_conv_0_1_weight] + %p_features_16_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_16_conv_0_1_bias] + %p_features_16_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_16_conv_1_0_weight] + %p_features_16_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_16_conv_1_1_weight] + %p_features_16_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_16_conv_1_1_bias] + %p_features_16_conv_2_weight : [num_users=1] = placeholder[target=p_features_16_conv_2_weight] + %p_features_16_conv_3_weight : [num_users=1] = placeholder[target=p_features_16_conv_3_weight] + %p_features_16_conv_3_bias : [num_users=1] = 
placeholder[target=p_features_16_conv_3_bias] + %p_features_17_conv_0_0_weight : [num_users=1] = placeholder[target=p_features_17_conv_0_0_weight] + %p_features_17_conv_0_1_weight : [num_users=1] = placeholder[target=p_features_17_conv_0_1_weight] + %p_features_17_conv_0_1_bias : [num_users=1] = placeholder[target=p_features_17_conv_0_1_bias] + %p_features_17_conv_1_0_weight : [num_users=1] = placeholder[target=p_features_17_conv_1_0_weight] + %p_features_17_conv_1_1_weight : [num_users=1] = placeholder[target=p_features_17_conv_1_1_weight] + %p_features_17_conv_1_1_bias : [num_users=1] = placeholder[target=p_features_17_conv_1_1_bias] + %p_features_17_conv_2_weight : [num_users=1] = placeholder[target=p_features_17_conv_2_weight] + %p_features_17_conv_3_weight : [num_users=1] = placeholder[target=p_features_17_conv_3_weight] + %p_features_17_conv_3_bias : [num_users=1] = placeholder[target=p_features_17_conv_3_bias] + %p_features_18_0_weight : [num_users=1] = placeholder[target=p_features_18_0_weight] + %p_features_18_1_weight : [num_users=1] = placeholder[target=p_features_18_1_weight] + %p_features_18_1_bias : [num_users=1] = placeholder[target=p_features_18_1_bias] + %b_features_0_1_running_mean : [num_users=1] = placeholder[target=b_features_0_1_running_mean] + %b_features_0_1_running_var : [num_users=1] = placeholder[target=b_features_0_1_running_var] + %b_features_1_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_1_conv_0_1_running_mean] + %b_features_1_conv_0_1_running_var : [num_users=1] = placeholder[target=b_features_1_conv_0_1_running_var] + %b_features_1_conv_2_running_mean : [num_users=1] = placeholder[target=b_features_1_conv_2_running_mean] + %b_features_1_conv_2_running_var : [num_users=1] = placeholder[target=b_features_1_conv_2_running_var] + %b_features_2_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_2_conv_0_1_running_mean] + %b_features_2_conv_0_1_running_var : [num_users=1] = 
placeholder[target=b_features_2_conv_0_1_running_var] + %b_features_2_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_2_conv_1_1_running_mean] + %b_features_2_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_2_conv_1_1_running_var] + %b_features_2_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_2_conv_3_running_mean] + %b_features_2_conv_3_running_var : [num_users=1] = placeholder[target=b_features_2_conv_3_running_var] + %b_features_3_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_3_conv_0_1_running_mean] + %b_features_3_conv_0_1_running_var : [num_users=1] = placeholder[target=b_features_3_conv_0_1_running_var] + %b_features_3_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_3_conv_1_1_running_mean] + %b_features_3_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_3_conv_1_1_running_var] + %b_features_3_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_3_conv_3_running_mean] + %b_features_3_conv_3_running_var : [num_users=1] = placeholder[target=b_features_3_conv_3_running_var] + %b_features_4_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_4_conv_0_1_running_mean] + %b_features_4_conv_0_1_running_var : [num_users=1] = placeholder[target=b_features_4_conv_0_1_running_var] + %b_features_4_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_4_conv_1_1_running_mean] + %b_features_4_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_4_conv_1_1_running_var] + %b_features_4_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_4_conv_3_running_mean] + %b_features_4_conv_3_running_var : [num_users=1] = placeholder[target=b_features_4_conv_3_running_var] + %b_features_5_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_5_conv_0_1_running_mean] + %b_features_5_conv_0_1_running_var : [num_users=1] = 
placeholder[target=b_features_5_conv_0_1_running_var] + %b_features_5_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_5_conv_1_1_running_mean] + %b_features_5_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_5_conv_1_1_running_var] + %b_features_5_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_5_conv_3_running_mean] + %b_features_5_conv_3_running_var : [num_users=1] = placeholder[target=b_features_5_conv_3_running_var] + %b_features_6_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_6_conv_0_1_running_mean] + %b_features_6_conv_0_1_running_var : [num_users=1] = placeholder[target=b_features_6_conv_0_1_running_var] + %b_features_6_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_6_conv_1_1_running_mean] + %b_features_6_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_6_conv_1_1_running_var] + %b_features_6_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_6_conv_3_running_mean] + %b_features_6_conv_3_running_var : [num_users=1] = placeholder[target=b_features_6_conv_3_running_var] + %b_features_7_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_7_conv_0_1_running_mean] + %b_features_7_conv_0_1_running_var : [num_users=1] = placeholder[target=b_features_7_conv_0_1_running_var] + %b_features_7_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_7_conv_1_1_running_mean] + %b_features_7_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_7_conv_1_1_running_var] + %b_features_7_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_7_conv_3_running_mean] + %b_features_7_conv_3_running_var : [num_users=1] = placeholder[target=b_features_7_conv_3_running_var] + %b_features_8_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_8_conv_0_1_running_mean] + %b_features_8_conv_0_1_running_var : [num_users=1] = 
placeholder[target=b_features_8_conv_0_1_running_var] + %b_features_8_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_8_conv_1_1_running_mean] + %b_features_8_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_8_conv_1_1_running_var] + %b_features_8_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_8_conv_3_running_mean] + %b_features_8_conv_3_running_var : [num_users=1] = placeholder[target=b_features_8_conv_3_running_var] + %b_features_9_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_9_conv_0_1_running_mean] + %b_features_9_conv_0_1_running_var : [num_users=1] = placeholder[target=b_features_9_conv_0_1_running_var] + %b_features_9_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_9_conv_1_1_running_mean] + %b_features_9_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_9_conv_1_1_running_var] + %b_features_9_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_9_conv_3_running_mean] + %b_features_9_conv_3_running_var : [num_users=1] = placeholder[target=b_features_9_conv_3_running_var] + %b_features_10_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_10_conv_0_1_running_mean] + %b_features_10_conv_0_1_running_var : [num_users=1] = placeholder[target=b_features_10_conv_0_1_running_var] + %b_features_10_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_10_conv_1_1_running_mean] + %b_features_10_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_10_conv_1_1_running_var] + %b_features_10_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_10_conv_3_running_mean] + %b_features_10_conv_3_running_var : [num_users=1] = placeholder[target=b_features_10_conv_3_running_var] + %b_features_11_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_11_conv_0_1_running_mean] + %b_features_11_conv_0_1_running_var : [num_users=1] = 
placeholder[target=b_features_11_conv_0_1_running_var] + %b_features_11_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_11_conv_1_1_running_mean] + %b_features_11_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_11_conv_1_1_running_var] + %b_features_11_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_11_conv_3_running_mean] + %b_features_11_conv_3_running_var : [num_users=1] = placeholder[target=b_features_11_conv_3_running_var] + %b_features_12_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_12_conv_0_1_running_mean] + %b_features_12_conv_0_1_running_var : [num_users=1] = placeholder[target=b_features_12_conv_0_1_running_var] + %b_features_12_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_12_conv_1_1_running_mean] + %b_features_12_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_12_conv_1_1_running_var] + %b_features_12_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_12_conv_3_running_mean] + %b_features_12_conv_3_running_var : [num_users=1] = placeholder[target=b_features_12_conv_3_running_var] + %b_features_13_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_13_conv_0_1_running_mean] + %b_features_13_conv_0_1_running_var : [num_users=1] = placeholder[target=b_features_13_conv_0_1_running_var] + %b_features_13_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_13_conv_1_1_running_mean] + %b_features_13_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_13_conv_1_1_running_var] + %b_features_13_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_13_conv_3_running_mean] + %b_features_13_conv_3_running_var : [num_users=1] = placeholder[target=b_features_13_conv_3_running_var] + %b_features_14_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_14_conv_0_1_running_mean] + %b_features_14_conv_0_1_running_var : [num_users=1] = 
placeholder[target=b_features_14_conv_0_1_running_var] + %b_features_14_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_14_conv_1_1_running_mean] + %b_features_14_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_14_conv_1_1_running_var] + %b_features_14_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_14_conv_3_running_mean] + %b_features_14_conv_3_running_var : [num_users=1] = placeholder[target=b_features_14_conv_3_running_var] + %b_features_15_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_15_conv_0_1_running_mean] + %b_features_15_conv_0_1_running_var : [num_users=1] = placeholder[target=b_features_15_conv_0_1_running_var] + %b_features_15_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_15_conv_1_1_running_mean] + %b_features_15_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_15_conv_1_1_running_var] + %b_features_15_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_15_conv_3_running_mean] + %b_features_15_conv_3_running_var : [num_users=1] = placeholder[target=b_features_15_conv_3_running_var] + %b_features_16_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_16_conv_0_1_running_mean] + %b_features_16_conv_0_1_running_var : [num_users=1] = placeholder[target=b_features_16_conv_0_1_running_var] + %b_features_16_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_16_conv_1_1_running_mean] + %b_features_16_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_16_conv_1_1_running_var] + %b_features_16_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_16_conv_3_running_mean] + %b_features_16_conv_3_running_var : [num_users=1] = placeholder[target=b_features_16_conv_3_running_var] + %b_features_17_conv_0_1_running_mean : [num_users=1] = placeholder[target=b_features_17_conv_0_1_running_mean] + %b_features_17_conv_0_1_running_var : [num_users=1] = 
placeholder[target=b_features_17_conv_0_1_running_var] + %b_features_17_conv_1_1_running_mean : [num_users=1] = placeholder[target=b_features_17_conv_1_1_running_mean] + %b_features_17_conv_1_1_running_var : [num_users=1] = placeholder[target=b_features_17_conv_1_1_running_var] + %b_features_17_conv_3_running_mean : [num_users=1] = placeholder[target=b_features_17_conv_3_running_mean] + %b_features_17_conv_3_running_var : [num_users=1] = placeholder[target=b_features_17_conv_3_running_var] + %b_features_18_1_running_mean : [num_users=1] = placeholder[target=b_features_18_1_running_mean] + %b_features_18_1_running_var : [num_users=1] = placeholder[target=b_features_18_1_running_var] + %x : [num_users=1] = placeholder[target=x] + %aten_convolution_default : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%x, %p_features_0_0_weight, None, [2, 2], [1, 1], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default, %p_features_0_1_weight, %p_features_0_1_bias, %b_features_0_1_running_mean, %b_features_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default, 0), kwargs = {}) + %aten_hardtanh_default : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_1 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default, %p_features_1_conv_0_0_weight, None, [1, 1], [1, 1], [1, 1], False, [0, 0], 32), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_1 : [num_users=1] = 
call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_1, %p_features_1_conv_0_1_weight, %p_features_1_conv_0_1_bias, %b_features_1_conv_0_1_running_mean, %b_features_1_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_1 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_1, 0), kwargs = {}) + %aten_hardtanh_default_1 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_1, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_2 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_1, %p_features_1_conv_1_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_2 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_2, %p_features_1_conv_2_weight, %p_features_1_conv_2_bias, %b_features_1_conv_2_running_mean, %b_features_1_conv_2_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_2 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_2, 0), kwargs = {}) + %aten_convolution_default_3 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%getitem_2, %p_features_2_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_3 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_3, %p_features_2_conv_0_1_weight, %p_features_2_conv_0_1_bias, %b_features_2_conv_0_1_running_mean, %b_features_2_conv_0_1_running_var, 0.1, 1e-05), 
kwargs = {}) + %getitem_3 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_3, 0), kwargs = {}) + %aten_hardtanh_default_2 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_3, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_4 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_2, %p_features_2_conv_1_0_weight, None, [2, 2], [1, 1], [1, 1], False, [0, 0], 96), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_4 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_4, %p_features_2_conv_1_1_weight, %p_features_2_conv_1_1_bias, %b_features_2_conv_1_1_running_mean, %b_features_2_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_4 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_4, 0), kwargs = {}) + %aten_hardtanh_default_3 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_4, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_5 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_3, %p_features_2_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_5 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_5, %p_features_2_conv_3_weight, %p_features_2_conv_3_bias, %b_features_2_conv_3_running_mean, %b_features_2_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_5 : [num_users=2] = call_function[target=operator.getitem](args = 
(%aten__native_batch_norm_legit_no_training_default_5, 0), kwargs = {}) + %aten_convolution_default_6 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%getitem_5, %p_features_3_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_6 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_6, %p_features_3_conv_0_1_weight, %p_features_3_conv_0_1_bias, %b_features_3_conv_0_1_running_mean, %b_features_3_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_6 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_6, 0), kwargs = {}) + %aten_hardtanh_default_4 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_6, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_7 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_4, %p_features_3_conv_1_0_weight, None, [1, 1], [1, 1], [1, 1], False, [0, 0], 144), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_7 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_7, %p_features_3_conv_1_1_weight, %p_features_3_conv_1_1_bias, %b_features_3_conv_1_1_running_mean, %b_features_3_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_7 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_7, 0), kwargs = {}) + %aten_hardtanh_default_5 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_7, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_8 : 
[num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_5, %p_features_3_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_8 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_8, %p_features_3_conv_3_weight, %p_features_3_conv_3_bias, %b_features_3_conv_3_running_mean, %b_features_3_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_8 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_8, 0), kwargs = {}) + %aten_add_tensor : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.add.Tensor](args = (%getitem_5, %getitem_8), kwargs = {}) + %aten_convolution_default_9 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_add_tensor, %p_features_4_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_9 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_9, %p_features_4_conv_0_1_weight, %p_features_4_conv_0_1_bias, %b_features_4_conv_0_1_running_mean, %b_features_4_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_9 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_9, 0), kwargs = {}) + %aten_hardtanh_default_6 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_9, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_10 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = 
(%aten_hardtanh_default_6, %p_features_4_conv_1_0_weight, None, [2, 2], [1, 1], [1, 1], False, [0, 0], 144), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_10 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_10, %p_features_4_conv_1_1_weight, %p_features_4_conv_1_1_bias, %b_features_4_conv_1_1_running_mean, %b_features_4_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_10 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_10, 0), kwargs = {}) + %aten_hardtanh_default_7 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_10, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_11 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_7, %p_features_4_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_11 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_11, %p_features_4_conv_3_weight, %p_features_4_conv_3_bias, %b_features_4_conv_3_running_mean, %b_features_4_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_11 : [num_users=2] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_11, 0), kwargs = {}) + %aten_convolution_default_12 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%getitem_11, %p_features_5_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_12 : [num_users=1] = 
call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_12, %p_features_5_conv_0_1_weight, %p_features_5_conv_0_1_bias, %b_features_5_conv_0_1_running_mean, %b_features_5_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_12 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_12, 0), kwargs = {}) + %aten_hardtanh_default_8 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_12, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_13 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_8, %p_features_5_conv_1_0_weight, None, [1, 1], [1, 1], [1, 1], False, [0, 0], 192), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_13 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_13, %p_features_5_conv_1_1_weight, %p_features_5_conv_1_1_bias, %b_features_5_conv_1_1_running_mean, %b_features_5_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_13 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_13, 0), kwargs = {}) + %aten_hardtanh_default_9 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_13, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_14 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_9, %p_features_5_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_14 : [num_users=1] = 
call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_14, %p_features_5_conv_3_weight, %p_features_5_conv_3_bias, %b_features_5_conv_3_running_mean, %b_features_5_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_14 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_14, 0), kwargs = {}) + %aten_add_tensor_1 : [num_users=2] = call_function[target=executorch.exir.dialects.edge._ops.aten.add.Tensor](args = (%getitem_11, %getitem_14), kwargs = {}) + %aten_convolution_default_15 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_add_tensor_1, %p_features_6_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_15 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_15, %p_features_6_conv_0_1_weight, %p_features_6_conv_0_1_bias, %b_features_6_conv_0_1_running_mean, %b_features_6_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_15 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_15, 0), kwargs = {}) + %aten_hardtanh_default_10 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_15, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_16 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_10, %p_features_6_conv_1_0_weight, None, [1, 1], [1, 1], [1, 1], False, [0, 0], 192), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_16 : [num_users=1] = 
call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_16, %p_features_6_conv_1_1_weight, %p_features_6_conv_1_1_bias, %b_features_6_conv_1_1_running_mean, %b_features_6_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_16 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_16, 0), kwargs = {}) + %aten_hardtanh_default_11 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_16, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_17 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_11, %p_features_6_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_17 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_17, %p_features_6_conv_3_weight, %p_features_6_conv_3_bias, %b_features_6_conv_3_running_mean, %b_features_6_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_17 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_17, 0), kwargs = {}) + %aten_add_tensor_2 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.add.Tensor](args = (%aten_add_tensor_1, %getitem_17), kwargs = {}) + %aten_convolution_default_18 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_add_tensor_2, %p_features_7_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_18 : [num_users=1] = 
call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_18, %p_features_7_conv_0_1_weight, %p_features_7_conv_0_1_bias, %b_features_7_conv_0_1_running_mean, %b_features_7_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_18 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_18, 0), kwargs = {}) + %aten_hardtanh_default_12 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_18, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_19 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_12, %p_features_7_conv_1_0_weight, None, [2, 2], [1, 1], [1, 1], False, [0, 0], 192), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_19 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_19, %p_features_7_conv_1_1_weight, %p_features_7_conv_1_1_bias, %b_features_7_conv_1_1_running_mean, %b_features_7_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_19 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_19, 0), kwargs = {}) + %aten_hardtanh_default_13 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_19, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_20 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_13, %p_features_7_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_20 : [num_users=1] = 
call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_20, %p_features_7_conv_3_weight, %p_features_7_conv_3_bias, %b_features_7_conv_3_running_mean, %b_features_7_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_20 : [num_users=2] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_20, 0), kwargs = {}) + %aten_convolution_default_21 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%getitem_20, %p_features_8_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_21 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_21, %p_features_8_conv_0_1_weight, %p_features_8_conv_0_1_bias, %b_features_8_conv_0_1_running_mean, %b_features_8_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_21 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_21, 0), kwargs = {}) + %aten_hardtanh_default_14 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_21, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_22 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_14, %p_features_8_conv_1_0_weight, None, [1, 1], [1, 1], [1, 1], False, [0, 0], 384), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_22 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_22, %p_features_8_conv_1_1_weight, %p_features_8_conv_1_1_bias, %b_features_8_conv_1_1_running_mean, 
%b_features_8_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_22 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_22, 0), kwargs = {}) + %aten_hardtanh_default_15 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_22, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_23 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_15, %p_features_8_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_23 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_23, %p_features_8_conv_3_weight, %p_features_8_conv_3_bias, %b_features_8_conv_3_running_mean, %b_features_8_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_23 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_23, 0), kwargs = {}) + %aten_add_tensor_3 : [num_users=2] = call_function[target=executorch.exir.dialects.edge._ops.aten.add.Tensor](args = (%getitem_20, %getitem_23), kwargs = {}) + %aten_convolution_default_24 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_add_tensor_3, %p_features_9_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_24 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_24, %p_features_9_conv_0_1_weight, %p_features_9_conv_0_1_bias, %b_features_9_conv_0_1_running_mean, %b_features_9_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_24 : [num_users=1] = 
call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_24, 0), kwargs = {}) + %aten_hardtanh_default_16 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_24, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_25 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_16, %p_features_9_conv_1_0_weight, None, [1, 1], [1, 1], [1, 1], False, [0, 0], 384), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_25 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_25, %p_features_9_conv_1_1_weight, %p_features_9_conv_1_1_bias, %b_features_9_conv_1_1_running_mean, %b_features_9_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_25 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_25, 0), kwargs = {}) + %aten_hardtanh_default_17 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_25, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_26 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_17, %p_features_9_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_26 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_26, %p_features_9_conv_3_weight, %p_features_9_conv_3_bias, %b_features_9_conv_3_running_mean, %b_features_9_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_26 : [num_users=1] = call_function[target=operator.getitem](args = 
(%aten__native_batch_norm_legit_no_training_default_26, 0), kwargs = {}) + %aten_add_tensor_4 : [num_users=2] = call_function[target=executorch.exir.dialects.edge._ops.aten.add.Tensor](args = (%aten_add_tensor_3, %getitem_26), kwargs = {}) + %aten_convolution_default_27 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_add_tensor_4, %p_features_10_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_27 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_27, %p_features_10_conv_0_1_weight, %p_features_10_conv_0_1_bias, %b_features_10_conv_0_1_running_mean, %b_features_10_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_27 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_27, 0), kwargs = {}) + %aten_hardtanh_default_18 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_27, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_28 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_18, %p_features_10_conv_1_0_weight, None, [1, 1], [1, 1], [1, 1], False, [0, 0], 384), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_28 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_28, %p_features_10_conv_1_1_weight, %p_features_10_conv_1_1_bias, %b_features_10_conv_1_1_running_mean, %b_features_10_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_28 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_28, 0), kwargs = {}) + 
%aten_hardtanh_default_19 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_28, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_29 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_19, %p_features_10_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_29 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_29, %p_features_10_conv_3_weight, %p_features_10_conv_3_bias, %b_features_10_conv_3_running_mean, %b_features_10_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_29 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_29, 0), kwargs = {}) + %aten_add_tensor_5 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.add.Tensor](args = (%aten_add_tensor_4, %getitem_29), kwargs = {}) + %aten_convolution_default_30 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_add_tensor_5, %p_features_11_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_30 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_30, %p_features_11_conv_0_1_weight, %p_features_11_conv_0_1_bias, %b_features_11_conv_0_1_running_mean, %b_features_11_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_30 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_30, 0), kwargs = {}) + %aten_hardtanh_default_20 : [num_users=1] = 
call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_30, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_31 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_20, %p_features_11_conv_1_0_weight, None, [1, 1], [1, 1], [1, 1], False, [0, 0], 384), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_31 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_31, %p_features_11_conv_1_1_weight, %p_features_11_conv_1_1_bias, %b_features_11_conv_1_1_running_mean, %b_features_11_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_31 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_31, 0), kwargs = {}) + %aten_hardtanh_default_21 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_31, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_32 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_21, %p_features_11_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_32 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_32, %p_features_11_conv_3_weight, %p_features_11_conv_3_bias, %b_features_11_conv_3_running_mean, %b_features_11_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_32 : [num_users=2] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_32, 0), kwargs = {}) + %aten_convolution_default_33 : [num_users=1] = 
call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%getitem_32, %p_features_12_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_33 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_33, %p_features_12_conv_0_1_weight, %p_features_12_conv_0_1_bias, %b_features_12_conv_0_1_running_mean, %b_features_12_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_33 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_33, 0), kwargs = {}) + %aten_hardtanh_default_22 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_33, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_34 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_22, %p_features_12_conv_1_0_weight, None, [1, 1], [1, 1], [1, 1], False, [0, 0], 576), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_34 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_34, %p_features_12_conv_1_1_weight, %p_features_12_conv_1_1_bias, %b_features_12_conv_1_1_running_mean, %b_features_12_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_34 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_34, 0), kwargs = {}) + %aten_hardtanh_default_23 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_34, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_35 : [num_users=1] = 
call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_23, %p_features_12_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_35 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_35, %p_features_12_conv_3_weight, %p_features_12_conv_3_bias, %b_features_12_conv_3_running_mean, %b_features_12_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_35 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_35, 0), kwargs = {}) + %aten_add_tensor_6 : [num_users=2] = call_function[target=executorch.exir.dialects.edge._ops.aten.add.Tensor](args = (%getitem_32, %getitem_35), kwargs = {}) + %aten_convolution_default_36 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_add_tensor_6, %p_features_13_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_36 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_36, %p_features_13_conv_0_1_weight, %p_features_13_conv_0_1_bias, %b_features_13_conv_0_1_running_mean, %b_features_13_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_36 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_36, 0), kwargs = {}) + %aten_hardtanh_default_24 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_36, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_37 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = 
(%aten_hardtanh_default_24, %p_features_13_conv_1_0_weight, None, [1, 1], [1, 1], [1, 1], False, [0, 0], 576), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_37 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_37, %p_features_13_conv_1_1_weight, %p_features_13_conv_1_1_bias, %b_features_13_conv_1_1_running_mean, %b_features_13_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_37 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_37, 0), kwargs = {}) + %aten_hardtanh_default_25 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_37, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_38 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_25, %p_features_13_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_38 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_38, %p_features_13_conv_3_weight, %p_features_13_conv_3_bias, %b_features_13_conv_3_running_mean, %b_features_13_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_38 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_38, 0), kwargs = {}) + %aten_add_tensor_7 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.add.Tensor](args = (%aten_add_tensor_6, %getitem_38), kwargs = {}) + %aten_convolution_default_39 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_add_tensor_7, %p_features_14_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], 
False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_39 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_39, %p_features_14_conv_0_1_weight, %p_features_14_conv_0_1_bias, %b_features_14_conv_0_1_running_mean, %b_features_14_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_39 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_39, 0), kwargs = {}) + %aten_hardtanh_default_26 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_39, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_40 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_26, %p_features_14_conv_1_0_weight, None, [2, 2], [1, 1], [1, 1], False, [0, 0], 576), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_40 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_40, %p_features_14_conv_1_1_weight, %p_features_14_conv_1_1_bias, %b_features_14_conv_1_1_running_mean, %b_features_14_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_40 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_40, 0), kwargs = {}) + %aten_hardtanh_default_27 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_40, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_41 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_27, %p_features_14_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + 
%aten__native_batch_norm_legit_no_training_default_41 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_41, %p_features_14_conv_3_weight, %p_features_14_conv_3_bias, %b_features_14_conv_3_running_mean, %b_features_14_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_41 : [num_users=2] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_41, 0), kwargs = {}) + %aten_convolution_default_42 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%getitem_41, %p_features_15_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_42 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_42, %p_features_15_conv_0_1_weight, %p_features_15_conv_0_1_bias, %b_features_15_conv_0_1_running_mean, %b_features_15_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_42 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_42, 0), kwargs = {}) + %aten_hardtanh_default_28 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_42, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_43 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_28, %p_features_15_conv_1_0_weight, None, [1, 1], [1, 1], [1, 1], False, [0, 0], 960), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_43 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_43, %p_features_15_conv_1_1_weight, 
%p_features_15_conv_1_1_bias, %b_features_15_conv_1_1_running_mean, %b_features_15_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_43 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_43, 0), kwargs = {}) + %aten_hardtanh_default_29 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_43, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_44 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_29, %p_features_15_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_44 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_44, %p_features_15_conv_3_weight, %p_features_15_conv_3_bias, %b_features_15_conv_3_running_mean, %b_features_15_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_44 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_44, 0), kwargs = {}) + %aten_add_tensor_8 : [num_users=2] = call_function[target=executorch.exir.dialects.edge._ops.aten.add.Tensor](args = (%getitem_41, %getitem_44), kwargs = {}) + %aten_convolution_default_45 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_add_tensor_8, %p_features_16_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_45 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_45, %p_features_16_conv_0_1_weight, %p_features_16_conv_0_1_bias, %b_features_16_conv_0_1_running_mean, 
%b_features_16_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_45 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_45, 0), kwargs = {}) + %aten_hardtanh_default_30 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_45, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_46 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_30, %p_features_16_conv_1_0_weight, None, [1, 1], [1, 1], [1, 1], False, [0, 0], 960), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_46 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_46, %p_features_16_conv_1_1_weight, %p_features_16_conv_1_1_bias, %b_features_16_conv_1_1_running_mean, %b_features_16_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_46 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_46, 0), kwargs = {}) + %aten_hardtanh_default_31 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_46, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_47 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_31, %p_features_16_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_47 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_47, %p_features_16_conv_3_weight, %p_features_16_conv_3_bias, %b_features_16_conv_3_running_mean, %b_features_16_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_47 : 
[num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_47, 0), kwargs = {}) + %aten_add_tensor_9 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.add.Tensor](args = (%aten_add_tensor_8, %getitem_47), kwargs = {}) + %aten_convolution_default_48 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_add_tensor_9, %p_features_17_conv_0_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_48 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_48, %p_features_17_conv_0_1_weight, %p_features_17_conv_0_1_bias, %b_features_17_conv_0_1_running_mean, %b_features_17_conv_0_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_48 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_48, 0), kwargs = {}) + %aten_hardtanh_default_32 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_48, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_49 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_32, %p_features_17_conv_1_0_weight, None, [1, 1], [1, 1], [1, 1], False, [0, 0], 960), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_49 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_49, %p_features_17_conv_1_1_weight, %p_features_17_conv_1_1_bias, %b_features_17_conv_1_1_running_mean, %b_features_17_conv_1_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_49 : [num_users=1] = call_function[target=operator.getitem](args = 
(%aten__native_batch_norm_legit_no_training_default_49, 0), kwargs = {}) + %aten_hardtanh_default_33 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_49, 0.0, 6.0), kwargs = {}) + %aten_convolution_default_50 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%aten_hardtanh_default_33, %p_features_17_conv_2_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_50 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_50, %p_features_17_conv_3_weight, %p_features_17_conv_3_bias, %b_features_17_conv_3_running_mean, %b_features_17_conv_3_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_50 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_50, 0), kwargs = {}) + %aten_convolution_default_51 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%getitem_50, %p_features_18_0_weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {}) + %aten__native_batch_norm_legit_no_training_default_51 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten._native_batch_norm_legit_no_training.default](args = (%aten_convolution_default_51, %p_features_18_1_weight, %p_features_18_1_bias, %b_features_18_1_running_mean, %b_features_18_1_running_var, 0.1, 1e-05), kwargs = {}) + %getitem_51 : [num_users=1] = call_function[target=operator.getitem](args = (%aten__native_batch_norm_legit_no_training_default_51, 0), kwargs = {}) + %aten_hardtanh_default_34 : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.hardtanh.default](args = (%getitem_51, 0.0, 6.0), kwargs = {}) + %aten_mean_dim : [num_users=1] = 
call_function[target=executorch.exir.dialects.edge._ops.aten.mean.dim](args = (%aten_hardtanh_default_34, [-1, -2], True), kwargs = {}) + return (aten_mean_dim,) + %executorch_call_delegate : [num_users=1] = call_function[target=torch.ops.higher_order.executorch_call_delegate](args = (%lowered_module_0, %x), kwargs = {}) + %getitem : [num_users=1] = call_function[target=operator.getitem](args = (%executorch_call_delegate, 0), kwargs = {}) + %aten_view_copy_default : [num_users=1] = call_function[target=executorch.exir.memory.view](args = (%getitem, [1, 1280]), kwargs = {}) + %alloc : [num_users=1] = call_function[target=executorch.exir.memory.alloc](args = (((1, 1280), torch.float32),), kwargs = {}) + %dim_order_ops__clone_dim_order_default : [num_users=1] = call_function[target=torch.ops.dim_order_ops._clone_dim_order.out](args = (%aten_view_copy_default,), kwargs = {dim_order: [0, 1], out: %alloc}) + %lowered_module_1 : [num_users=1] = get_attr[target=lowered_module_1] + backend_id: XnnpackBackend + lowered graph(): + %p_classifier_1_weight : [num_users=1] = placeholder[target=p_classifier_1_weight] + %p_classifier_1_bias : [num_users=1] = placeholder[target=p_classifier_1_bias] + %dim_order_ops__clone_dim_order_default : [num_users=1] = placeholder[target=dim_order_ops__clone_dim_order_default] + %aten_linear_default : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.linear.default](args = (%dim_order_ops__clone_dim_order_default, %p_classifier_1_weight, %p_classifier_1_bias), kwargs = {}) + return (aten_linear_default,) + %executorch_call_delegate_1 : [num_users=1] = call_function[target=torch.ops.higher_order.executorch_call_delegate](args = (%lowered_module_1, %dim_order_ops__clone_dim_order_default), kwargs = {}) + %getitem_1 : [num_users=1] = call_function[target=operator.getitem](args = (%executorch_call_delegate_1, 0), kwargs = {}) + return (getitem_1,) ``` +
diff --git a/docs/source/desktop-backends.md b/docs/source/desktop-backends.md new file mode 100644 index 00000000000..e4220edb47f --- /dev/null +++ b/docs/source/desktop-backends.md @@ -0,0 +1,27 @@ +(desktop-backends)= +# Backends + +Available hardware acceleration backends for desktop platforms. + +## Linux Backends + +- {doc}`desktop-xnnpack` — XNNPACK (CPU acceleration) +- {doc}`desktop-openvino` — OpenVINO (Intel hardware optimization) + +## macOS Backends + +- {doc}`desktop-coreml` — CoreML (recommended for Apple Silicon) +- {doc}`desktop-mps` — Metal Performance Shaders (Apple Silicon GPU) +- {doc}`desktop-xnnpack` — XNNPACK (CPU acceleration) + +## Windows Backends + +- {doc}`desktop-xnnpack` — XNNPACK (CPU acceleration) +- {doc}`desktop-openvino` — OpenVINO (Intel hardware optimization) + +```{toctree} +:hidden: +desktop-xnnpack +desktop-openvino +desktop-coreml +desktop-mps diff --git a/docs/source/desktop-coreml.md b/docs/source/desktop-coreml.md new file mode 100644 index 00000000000..48271326d87 --- /dev/null +++ b/docs/source/desktop-coreml.md @@ -0,0 +1 @@ +```{include} backends-coreml.md diff --git a/docs/source/desktop-mps.md b/docs/source/desktop-mps.md new file mode 100644 index 00000000000..d6f305d33aa --- /dev/null +++ b/docs/source/desktop-mps.md @@ -0,0 +1 @@ +```{include} backends-mps.md diff --git a/docs/source/desktop-openvino.md b/docs/source/desktop-openvino.md new file mode 100644 index 00000000000..a0fd5774c73 --- /dev/null +++ b/docs/source/desktop-openvino.md @@ -0,0 +1 @@ +```{include} build-run-openvino.md diff --git a/docs/source/desktop-section.md b/docs/source/desktop-section.md new file mode 100644 index 00000000000..7afccbe1d4f --- /dev/null +++ b/docs/source/desktop-section.md @@ -0,0 +1,19 @@ +(desktop-section)= +# Desktop & Laptop Platforms + +Deploy ExecuTorch on Linux, macOS, and Windows with optimized backends for each platform. 
+ +## Platform Overview & Runtime + +- {doc}`using-executorch-cpp` — C++ runtime integration guide +- {doc}`using-executorch-building-from-source` — Building ExecuTorch from source + +## Backends + +- {doc}`desktop-backends` — Available desktop backends and platform-specific optimization + +```{toctree} +:hidden: +using-executorch-cpp +using-executorch-building-from-source +desktop-backends diff --git a/docs/source/desktop-xnnpack.md b/docs/source/desktop-xnnpack.md new file mode 100644 index 00000000000..315dd747006 --- /dev/null +++ b/docs/source/desktop-xnnpack.md @@ -0,0 +1 @@ +```{include} backends-xnnpack.md diff --git a/docs/source/developer-tools.md b/docs/source/developer-tools.md new file mode 100644 index 00000000000..d3b90b7adc8 --- /dev/null +++ b/docs/source/developer-tools.md @@ -0,0 +1,16 @@ +# Tools + +```{toctree} +:maxdepth: 1 + +devtools-overview +bundled-io +etrecord +etdump +runtime-profiling +model-debugging +model-inspector +memory-planning-inspection +delegate-debugging +devtools-tutorial +``` diff --git a/docs/source/devtools-overview.md b/docs/source/devtools-overview.md index 449dd1485dc..8e13e67f1a1 100644 --- a/docs/source/devtools-overview.md +++ b/docs/source/devtools-overview.md @@ -41,6 +41,6 @@ More details are available in the [ETDump documentation](etdump.md) on how to ge ### Inspector APIs -The Inspector Python APIs are the main user enrty point into the Developer Tools. They join the data sourced from ETDump and ETRecord to give users access to all the performance and debug data sourced from the runtime along with linkage back to eager model source code and module hierarchy in an easy to use API. +The Inspector Python APIs are the main user entry point into the Developer Tools. They join the data sourced from ETDump and ETRecord to give users access to all the performance and debug data sourced from the runtime along with linkage back to eager model source code and module hierarchy in an easy to use API. 
More details are available in the [Inspector API documentation](model-inspector.rst) on how to use the Inspector APIs. diff --git a/docs/source/devtools-tutorial.md b/docs/source/devtools-tutorial.md index 7c6cedc311b..6d540dc7f35 100644 --- a/docs/source/devtools-tutorial.md +++ b/docs/source/devtools-tutorial.md @@ -1,3 +1,3 @@ ## Developer Tools Usage Tutorial -Please refer to the [Developer Tools tutorial](https://pytorch.org/executorch/main/tutorials/devtools-integration-tutorial) for a walkthrough on how to profile a model in ExecuTorch using the Developer Tools. +Please refer to the [Developer Tools tutorial](tutorials/devtools-integration-tutorial) for a walkthrough on how to profile a model in ExecuTorch using the Developer Tools. diff --git a/docs/source/edge-platforms-section.md b/docs/source/edge-platforms-section.md new file mode 100644 index 00000000000..99e44093544 --- /dev/null +++ b/docs/source/edge-platforms-section.md @@ -0,0 +1,73 @@ +(edge-platforms-section)= +# Edge + +Deploy ExecuTorch on mobile, desktop, and embedded platforms with optimized backends for each. + +ExecuTorch supports deployment across a wide variety of edge computing platforms, from high-end mobile devices to constrained embedded systems and microcontrollers. + +## Android + +Deploy ExecuTorch on Android devices with hardware acceleration support. + +**→ {doc}`android-section` — Complete Android deployment guide** + +Key features: +- Hardware acceleration support (CPU, GPU, NPU) +- Multiple backend options (XNNPACK, Vulkan, Qualcomm, MediaTek, ARM, Samsung) +- Comprehensive examples and demos + +## iOS + +Deploy ExecuTorch on iOS devices with Apple hardware acceleration. 
+ +**→ {doc}`ios-section` — Complete iOS deployment guide** + +Key features: +- Apple hardware optimization (CoreML, MPS, XNNPACK) +- Swift and Objective-C integration +- LLM and computer vision examples + +## Desktop & Laptop Platforms + +Deploy ExecuTorch on Linux, macOS, and Windows with optimized backends. + +**→ {doc}`desktop-section` — Complete desktop deployment guide** + +Key features: +- Cross-platform C++ runtime +- Platform-specific optimization (OpenVINO, CoreML, MPS) +- CPU and GPU acceleration options + +## Embedded Systems + +Deploy ExecuTorch on constrained embedded systems and microcontrollers. + +**→ {doc}`embedded-section` — Complete embedded deployment guide** + +Key features: + +- Resource-constrained deployment +- DSP and NPU acceleration (Cadence, ARM Ethos-U, NXP) +- Custom backend development support +- LLM and computer vision examples + +## Troubleshooting & Support + +- **{doc}`using-executorch-troubleshooting`** - Common issues and solutions across all platforms + +## Next Steps + +After choosing your platform: +- **{doc}`backends-section`** - Deep dive into backend selection and optimization +- **{doc}`llm/working-with-llms`** - Working with Large Language Models on edge devices + +```{toctree} +:hidden: +:maxdepth: 2 +:caption: Edge Platforms + +android-section +ios-section +desktop-section +embedded-section +using-executorch-troubleshooting diff --git a/docs/source/embedded-arm-ethos-u.md b/docs/source/embedded-arm-ethos-u.md new file mode 100644 index 00000000000..cdc544a6553 --- /dev/null +++ b/docs/source/embedded-arm-ethos-u.md @@ -0,0 +1 @@ +```{include} backends-arm-ethos-u.md diff --git a/docs/source/embedded-backends.md b/docs/source/embedded-backends.md new file mode 100644 index 00000000000..4ed7962ef42 --- /dev/null +++ b/docs/source/embedded-backends.md @@ -0,0 +1,20 @@ +(embedded-backends)= +# Backends + +Available hardware acceleration backends for embedded systems. 
+ +## DSP Acceleration + +- {doc}`embedded-cadence` — Cadence Xtensa DSP processors + +## NPU Acceleration + +- {doc}`embedded-arm-ethos-u` — ARM Ethos-U NPU acceleration +- {doc}`embedded-nxp` — NXP eIQ Neutron Backend + + +```{toctree} +:hidden: +embedded-cadence +embedded-arm-ethos-u +embedded-nxp diff --git a/docs/source/embedded-cadence.md b/docs/source/embedded-cadence.md new file mode 100644 index 00000000000..d2f7ea78259 --- /dev/null +++ b/docs/source/embedded-cadence.md @@ -0,0 +1 @@ +```{include} backends-cadence.md diff --git a/docs/source/embedded-nxp.md b/docs/source/embedded-nxp.md new file mode 100644 index 00000000000..35d8f0ab75d --- /dev/null +++ b/docs/source/embedded-nxp.md @@ -0,0 +1 @@ +```{include} backends-nxp.md diff --git a/docs/source/embedded-section.md b/docs/source/embedded-section.md new file mode 100644 index 00000000000..834001afbc3 --- /dev/null +++ b/docs/source/embedded-section.md @@ -0,0 +1,39 @@ +(embedded-section)= + +# Embedded Systems + +Deploy ExecuTorch on constrained embedded systems and microcontrollers. + +## API Reference & Development + +Start here for C++ development with ExecuTorch runtime APIs and essential tutorials. 
+ +- {doc}`executorch-runtime-api-reference` — **Start here**: Complete runtime API reference for embedded development +- {doc}`running-a-model-cpp-tutorial` — Step-by-step C++ API tutorial with practical examples +- {doc}`extension-module` — Custom module extensions for specialized functionality +- {doc}`extension-tensor` — Tensor operations and memory management extensions + +## Build & Integration Guide + +- {doc}`using-executorch-cpp` — Complete setup guide for C++ runtime integration +- {doc}`using-executorch-building-from-source` — Building from Source + +## Choose Backend for acceleration + +- {doc}`embedded-backends` — Available embedded backends and acceleration options + +## Tutorials + +- {doc}`tutorial-arm-ethos-u` — Export a simple PyTorch model for the ExecuTorch Ethos-U backend + + +```{toctree} +:hidden: +executorch-runtime-api-reference +running-a-model-cpp-tutorial +extension-module +extension-tensor +using-executorch-cpp +using-executorch-building-from-source +embedded-backends +tutorial-arm-ethos-u diff --git a/docs/source/etrecord.rst b/docs/source/etrecord.rst index 1ab84a6ee10..39bc45cab5a 100644 --- a/docs/source/etrecord.rst +++ b/docs/source/etrecord.rst @@ -23,13 +23,120 @@ It should be provided to the `Inspector API `__ to link ba Generating an ``ETRecord`` -------------------------- -The user should use the following API to generate an ``ETRecord`` file. They -will be expected to provide the Edge Dialect program (returned by the call to ``to_edge()``), -the ExecuTorch program (returned by the call to ``to_executorch()``), and optional models that -they are interested in working with via our tooling. +There are multiple ways to generate an ``ETRecord`` for debugging purposes: + +Method 1: Using the ``generate_etrecord`` Parameter (Recommended) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The recommended approach is to enable ``ETRecord`` generation by passing ``generate_etrecord=True`` +to your export API calls. 
This can be used with: + +* ``executorch.export()`` - High-level export API +* ``to_edge()`` - Edge dialect conversion +* ``to_edge_transform_and_lower()`` - Edge conversion with transformations and lowering + +After export completes, retrieve the ``ETRecord`` using the ``get_etrecord()`` method, and save it using the ``save()`` method: + +**Example with** ``executorch.export()``: + +.. code-block:: python + + import executorch + from executorch.export import ExportRecipe + + # Export with ETRecord generation enabled + session = executorch.export( + model=model, + example_inputs=[example_inputs], + export_recipe=recipe, + generate_etrecord=True # Enable ETRecord generation + ) + + # Get and save the ETRecord + etrecord = session.get_etrecord() + etrecord.save("model_debug.etrecord") + +**Example with** ``to_edge()``: + +.. code-block:: python + + from executorch.exir.program import to_edge + from torch.export import export + + # Export model first + exported_program = export(model, example_inputs) + + # Convert to edge with ETRecord generation + edge_manager = to_edge( + exported_program, + generate_etrecord=True # Enable ETRecord generation + ) + + # Apply transformations + edge_manager = edge_manager.to_backend() + et_manager = edge_manager.to_executorch() + + # Get and save ETRecord + etrecord = et_manager.get_etrecord() + etrecord.save("edge_debug.etrecord") + +**Example with** ``to_edge_transform_and_lower()``: + +.. 
code-block:: python + + from executorch.exir.program import to_edge_transform_and_lower + from torch.export import export + + # Export model first + exported_program = export(model, example_inputs) + + # Transform and lower with ETRecord generation + edge_manager = to_edge_transform_and_lower( + exported_program, + partitioner=[MyPartitioner()], + generate_etrecord=True # Enable ETRecord generation + ) + + et_manager = edge_manager.to_executorch() + + # Get and save ETRecord + etrecord = et_manager.get_etrecord() + etrecord.save("debug.etrecord") + +Method 2: Using the ``generate_etrecord()`` Function +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +You can also use the standalone ``generate_etrecord()`` function to generate an ``ETRecord``. +This method requires you to provide the Edge Dialect program (returned by ``to_edge()``), +the ExecuTorch program (returned by ``to_executorch()``), and optional models. .. warning:: - Users should do a deepcopy of the output of ``to_edge()`` and pass in the deepcopy to the ``generate_etrecord`` API. This is needed because the subsequent call, ``to_executorch()``, does an in-place mutation and will lose debug data in the process. + When using the standalone function, users should do a deepcopy of the output of ``to_edge()`` and pass in the deepcopy to the ``generate_etrecord`` API. This is needed because the subsequent call, ``to_executorch()``, does an in-place mutation and will lose debug data in the process. + +**Example:** + +.. 
code-block:: python + + import copy + from executorch.devtools import generate_etrecord + from torch.export import export + + # Export and convert to edge + aten_dialect = export(model, example_inputs, strict=True) + edge_program = to_edge(aten_dialect) + + # Create copy for ETRecord (needed because to_executorch modifies in-place) + edge_program_copy = copy.deepcopy(edge_program) + + # Convert to ExecutorchProgramManager + executorch_program = edge_program_copy.to_executorch() + + # Generate ETRecord separately + generate_etrecord( + "debug.etrecord", + edge_program, + executorch_program, + ) .. currentmodule:: executorch.devtools.etrecord._etrecord .. autofunction:: generate_etrecord diff --git a/docs/source/examples.md b/docs/source/examples.md new file mode 100644 index 00000000000..6a3a8ac29c9 --- /dev/null +++ b/docs/source/examples.md @@ -0,0 +1,9 @@ +# Examples + +```{toctree} +:maxdepth: 1 + +Building an ExecuTorch Android Demo App +Building an ExecuTorch iOS Demo App +tutorial-arm +``` diff --git a/docs/source/executorch_custom_versions.py b/docs/source/executorch_custom_versions.py deleted file mode 100644 index 590f21b10ec..00000000000 --- a/docs/source/executorch_custom_versions.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -""" -Sphinx extension to replace ${executorch_version:TAG} with version numbers. - -It also defines a special variable ${executorch_version} that is set to the value -of `EXECUTORCH_VERSION` defined in this file. - -This custom extension pulls third-party version strings from files in the -.ci/docker/ci_commit_pins directory, and uses them to expand specific strings in -markdown files. - -For example, `${executorch_version:pytorch}` will be replaced with the -appropriate pytorch version string used by CI. 
-""" - -import os - -from docutils import nodes - -version_file_names = [ - "buck2.txt", - "pytorch.txt", -] - -EXECUTORCH_VERSION = "0.7.0" - -variables: dict[str, str] = {} - - -def populate_version_variable(): - variables["${executorch_version}"] = EXECUTORCH_VERSION - cwd = os.getcwd() - version_file_path = os.path.join(cwd, "..", ".ci", "docker", "ci_commit_pins") - - for file_name in version_file_names: - file_path = os.path.join(version_file_path, file_name) - with open(file_path, "r") as f: - var_name = "${executorch_version:" + file_name.split(".")[0] + "}" - variables[var_name] = f.read().strip() - - -populate_version_variable() - - -def replace_variables(app, doctree, docname): - # Replace in regular text: - for node in doctree.traverse(nodes.Text): - new_text = node.astext() - for var, value in variables.items(): - new_text = new_text.replace(var, value) - node.parent.replace(node, nodes.Text(new_text)) - # Replace in code blocks: - for node in doctree.traverse(nodes.literal_block): - new_text = node.astext() - for var, value in variables.items(): - new_text = new_text.replace(var, value) - - classes = node.get("classes", []) - # check if the output is generated by sphinx-gallery and if yes, keep the original - # CSS classes. Otherwise, the sphinx-gallery generated outputs are - # formatted as regular code blocks with gray background instead of pink. 
- is_sphinx_gallery = any("sphx-glr" in class_ for class_ in classes) - - language = node.get("language") - - if is_sphinx_gallery: - new_literal_block = nodes.literal_block(new_text, new_text, classes=classes) - else: - new_literal_block = nodes.literal_block( - new_text, - new_text, - classes=["highlight-none", "notranslate"], - language=language, - ) - - node.parent.replace(node, new_literal_block) - - -def setup(app): - app.connect("doctree-resolved", replace_variables) diff --git a/docs/source/export-overview.md b/docs/source/export-overview.md index d07701d06cd..c96716a0949 100644 --- a/docs/source/export-overview.md +++ b/docs/source/export-overview.md @@ -11,5 +11,5 @@ program, making it easier for you to understand and implement the process. To learn more about exporting your model: -* Complete the [Exporting to ExecuTorch tutorial](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial). +* Complete the [Exporting to ExecuTorch tutorial](tutorials/export-to-executorch-tutorial). * Read the [torch.export documentation](https://pytorch.org/docs/2.1/export.html). 
diff --git a/docs/source/extension-module.md b/docs/source/extension-module.md index 29aa6712d37..690256fecbb 100644 --- a/docs/source/extension-module.md +++ b/docs/source/extension-module.md @@ -6,7 +6,7 @@ In the [Detailed C++ Runtime APIs Tutorial](running-a-model-cpp-tutorial.md), we ## Example -Let's see how we can run the `SimpleConv` model generated from the [Exporting to ExecuTorch tutorial](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial) using the `Module` and [`TensorPtr`](extension-tensor.md) APIs: +Let's see how we can run the `SimpleConv` model generated from the [Exporting to ExecuTorch tutorial](tutorials/export-to-executorch-tutorial) using the `Module` and [`TensorPtr`](extension-tensor.md) APIs: ```cpp #include diff --git a/docs/source/file-formats-advanced.md b/docs/source/file-formats-advanced.md new file mode 100644 index 00000000000..c16ebccfd65 --- /dev/null +++ b/docs/source/file-formats-advanced.md @@ -0,0 +1,17 @@ +(file-formats-advanced)= + +# File Formats + +ExecuTorch file format specifications and internal structure. + +## Program File Formats + +- {doc}`pte-file-format` — PTE (PyTorch ExecuTorch) file format specification +- {doc}`ptd-file-format` — PTD file format specification + +```{toctree} +:hidden: +:maxdepth: 1 + +pte-file-format +ptd-file-format diff --git a/docs/source/getting-started-architecture.md b/docs/source/getting-started-architecture.md index ef4a12d1a7f..617d521b802 100644 --- a/docs/source/getting-started-architecture.md +++ b/docs/source/getting-started-architecture.md @@ -4,7 +4,7 @@ This page describes the technical architecture of ExecuTorch and its individual **Context** -In order to target on-device AI with diverse hardware, critical power requirements, and realtime processing needs, a single monolithic solution is not practical. Instead, a modular, layered, and extendable architecture is desired. 
ExecuTorch defines a streamlined workflow to prepare (export, transformation, and compilation) and execute a PyTorch program, with opinionated out-of-the-box default components and well-defined entry points for customizations. This architecture greatly improves portability, allowing engineers to use a performant lightweight, cross-platform runtime that easily integrates into different devices and platforms. +In order to target on-device AI with diverse hardware, critical power requirements, and real-time processing needs, a single monolithic solution is not practical. Instead, a modular, layered, and extensible architecture is desired. ExecuTorch defines a streamlined workflow to prepare (export, transformation, and compilation) and execute a PyTorch program, with opinionated out-of-the-box default components and well-defined entry points for customizations. This architecture greatly improves portability, allowing engineers to use a performant lightweight, cross-platform runtime that easily integrates into different devices and platforms. ## Overview @@ -89,6 +89,6 @@ _Executor_ is the entry point to load the program and execute it. The execution ## Developer Tools -It should be efficient for users to go from research to production using the flow above. Productivity is essentially important, for users to author, optimize and deploy their models. We provide [ExecuTorch Developer Tools](devtools-overview.md) to improve productivity. The Developer Tools are not in the diagram. Instead it's a tool set that covers the developer workflow in all three phases. +It should be efficient for users to go from research to production using the flow above. Productivity is especially important for users to author, optimize, and deploy their models. We provide [ExecuTorch Developer Tools](devtools-overview.md) to improve productivity. The Developer Tools are not in the diagram. Instead it's a tool set that covers the developer workflow in all three phases. 
During the program preparation and execution, users can use the ExecuTorch Developer Tools to profile, debug, or visualize the program. Since the end-to-end flow is within the PyTorch ecosystem, users can correlate and display performance data along with graph visualization as well as direct references to the program source code and model hierarchy. We consider this to be a critical component for quickly iterating and lowering PyTorch programs to edge devices and environments. diff --git a/docs/source/getting-started.md b/docs/source/getting-started.md index d3d9662f5c3..51c59f5e021 100644 --- a/docs/source/getting-started.md +++ b/docs/source/getting-started.md @@ -68,7 +68,7 @@ with open("model.pte", "wb") as f: If the model requires varying input sizes, you will need to specify the varying dimensions and bounds as part of the `export` call. See [Model Export and Lowering](using-executorch-export.md) for more information. -The hardware backend to target is controlled by the partitioner parameter to to\_edge\_transform\_and\_lower. In this example, the XnnpackPartitioner is used to target mobile CPUs. See the [backend-specific documentation](backends-overview.md) for information on how to use each backend. +The hardware backend to target is controlled by the partitioner parameter to `to_edge_transform_and_lower`. In this example, the XnnpackPartitioner is used to target mobile CPUs. See the [backend-specific documentation](backends-overview.md) for information on how to use each backend. Quantization can also be done at this stage to reduce model size and runtime. Quantization is backend-specific. See the documentation for the target backend for a full description of supported quantization schemes. 
@@ -89,7 +89,7 @@ input_tensor: torch.Tensor = torch.randn(1, 3, 224, 224) program = runtime.load_program("model.pte") method = program.load_method("forward") output: List[torch.Tensor] = method.execute([input_tensor]) -print("Run succesfully via executorch") +print("Run successfully via executorch") from torchvision.models.mobilenetv2 import MobileNet_V2_Weights import torchvision.models as models @@ -226,5 +226,5 @@ ExecuTorch provides a high-degree of customizability to support diverse hardware - [Using ExecuTorch on Android](using-executorch-android.md) and [Using ExecuTorch on iOS](using-executorch-ios.md) for mobile runtime integration. - [Using ExecuTorch with C++](using-executorch-cpp.md) for embedded and mobile native development. - [Profiling and Debugging](using-executorch-troubleshooting.md) for developer tooling and debugging. -- [API Reference](export-to-executorch-api-reference.md) for a full description of available APIs. +- [API Reference](export-to-executorch-api-reference.rst) for a full description of available APIs. - [Examples](https://github.com/pytorch/executorch/tree/main/examples) for demo apps and example code. diff --git a/docs/source/index.md b/docs/source/index.md index 8afe4e85d78..b65139319a7 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -1,298 +1,195 @@ (home)= # Welcome to the ExecuTorch Documentation -**ExecuTorch** is PyTorch's solution to training and inference on the -Edge. +**ExecuTorch** is PyTorch's solution for efficient AI inference on edge devices — from mobile phones to embedded systems. ## Key Value Propositions -- **Portability:** Compatibility with a wide variety of computing - platforms, from high-end mobile phones to highly constrained - embedded systems and microcontrollers. -- **Productivity:** Enabling developers to use the same toolchains and - Developer Tools from PyTorch model authoring and conversion, to - debugging and deployment to a wide variety of platforms. 
-- **Performance:** Providing end users with a seamless and - high-performance experience due to a lightweight runtime and - utilizing full hardware capabilities such as CPUs, NPUs, and DSPs. - -ExecuTorch provides support for: - -* **Strong Model Support** LLMs (Large Language Models), - CV (Computer Vision), ASR (Automatic Speech Recognition), TTS (Text To Speech) -* **All Major Platforms** Android, Mac, Linux, Windows -* **Rich Acceleration Support** Apple, Arm, Cadence, MediaTek, NXP, OpenVino, Qualcomm, Vulkan, XNNPACK - -### Documentation Navigation -#### Introduction -- [Overview](intro-overview) -- [How it Works](intro-how-it-works) -- [Getting Started with Architecture](getting-started-architecture) -- [Concepts](concepts) -#### Usage -- [Getting Started](getting-started) -- [Using Executorch Export](using-executorch-export) -- [Using Executorch on Android](using-executorch-android) -- [Using Executorch on iOS](using-executorch-ios) -- [Using Executorch with C++](using-executorch-cpp) -- [Runtime Integration](using-executorch-runtime-integration) -- [Troubleshooting](using-executorch-troubleshooting) -- [Building from Source](using-executorch-building-from-source) -- [Quantization](quantization-overview) -- [FAQs](using-executorch-faqs) -#### Examples -- [Android Demo Apps](https://github.com/meta-pytorch/executorch-examples/tree/main/dl3/android/DeepLabV3Demo#executorch-android-demo-app) -- [iOS Demo Apps](https://github.com/meta-pytorch/executorch-examples/tree/main/mv3/apple/ExecuTorchDemo) -- [Hugging Face Models](https://github.com/huggingface/optimum-executorch/blob/main/README.md) -#### Backends -- [Overview](backends-overview) -- [XNNPACK](backends-xnnpack) -- [Core ML](backends-coreml) -- [MPS](backends-mps) -- [Vulkan](backends-vulkan) -- [ARM Ethos-U](backends-arm-ethos-u) -- [ARM VGF](backends-arm-vgf) -- [Qualcomm](backends-qualcomm) -- [MediaTek](backends-mediatek) -- [Cadence](backends-cadence) -- [OpenVINO](build-run-openvino) -- 
[NXP](backend-nxp) -#### Developer Tools -- [Overview](devtools-overview) -- [Bundled IO](bundled-io) -- [ETRecord](etrecord) -- [ETDump](etdump) -- [Runtime Profiling](runtime-profiling) -- [Model Debugging](model-debugging) -- [Model Inspector](model-inspector) -- [Memory Planning Inspection](memory-planning-inspection) -- [Delegate Debugging](delegate-debugging) -- [Tutorial](devtools-tutorial) -#### Runtime -- [Overview](runtime-overview) -- [Extension Module](extension-module) -- [Extension Tensor](extension-tensor) -- [Detailed C++ Runtime APIs Tutorial](running-a-model-cpp-tutorial) -- [Backend Delegate Implementation and Linking](runtime-backend-delegate-implementation-and-linking) -- [Platform Abstraction Layer](runtime-platform-abstraction-layer) -#### Portable C++ Programming -- [PTE File Format](pte-file-format) -- [PTD File Format](ptd-file-format) -#### API Reference -- [Export to Executorch API Reference](export-to-executorch-api-reference) -- [Executorch Runtime API Reference](executorch-runtime-api-reference) -- [Runtime Python API Reference](runtime-python-api-reference) -- [API Life Cycle](api-life-cycle) -- [Javadoc](https://pytorch.org/executorch/main/javadoc/) -#### Kernel Library -- [Overview](kernel-library-overview) -- [Custom ATen Kernel](kernel-library-custom-aten-kernel) -- [Selective Build](kernel-library-selective-build) -#### Working with LLMs -- [Getting Started](llm/getting-started.md) -- [Exporting LLMs](llm/export-llm.md) -- [Exporting custom LLMs](llm/export-custom-llm.md) -- [Running with C++](llm/run-with-c-plus-plus.md) -- [Running on Android (XNNPack)](llm/llama-demo-android.md) -- [Running on Android (QNN)](llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md) -- [Running on iOS](llm/run-on-ios.md) -#### Backend Development -- [Delegates Integration](backend-delegates-integration) -- [XNNPACK Reference](backend-delegates-xnnpack-reference) -- [Dependencies](backend-delegates-dependencies) -- [Compiler Delegate and 
Partitioner](compiler-delegate-and-partitioner) -- [Debug Backend Delegate](debug-backend-delegate) -#### IR Specification -- [EXIR](ir-exir) -- [Ops Set Definition](ir-ops-set-definition) -#### Compiler Entry Points -- [Backend Dialect](compiler-backend-dialect) -- [Custom Compiler Passes](compiler-custom-compiler-passes) -- [Memory Planning](compiler-memory-planning) -#### Contributing -- [Contributing](contributing) +- **Portability:** Run on diverse platforms, from high-end mobile to constrained microcontrollers +- **Performance:** Lightweight runtime with full hardware acceleration (CPU, GPU, NPU, DSP) +- **Productivity:** Use familiar PyTorch tools from authoring to deployment -```{toctree} -:glob: -:maxdepth: 1 -:caption: Introduction -:hidden: +--- -intro-overview -intro-how-it-works -getting-started-architecture -concepts -``` +## 🎯 Wins & Success Stories -```{toctree} -:glob: -:maxdepth: 1 -:caption: Usage -:hidden: +::::{grid} 1 +:class-container: success-showcase +:::{grid-item-card} +:class-header: bg-primary text-white +:class-body: text-center +[View All Success Stories →](success-stories) +::: +:::: -getting-started -using-executorch-export -using-executorch-android -using-executorch-ios -using-executorch-cpp -using-executorch-runtime-integration -using-executorch-troubleshooting -using-executorch-building-from-source -using-executorch-faqs -``` +--- -```{toctree} -:glob: -:maxdepth: 1 -:caption: Examples -:hidden: +## Quick Navigation -Building an ExecuTorch Android Demo App -Building an ExecuTorch iOS Demo App -tutorial-arm.md -``` +::::{grid} 2 -```{toctree} -:glob: -:maxdepth: 1 -:caption: Backends -:hidden: +:::{grid-item-card} **Get Started** +:link: quick-start-section +:link-type: doc -backends-overview -backends-xnnpack -backends-coreml -backends-mps -backends-vulkan -backends-arm-ethos-u -backends-qualcomm -backends-mediatek -backends-cadence -OpenVINO Backend -backends-nxp -``` +New to ExecuTorch? 
Start here for installation and your first model deployment. +::: -```{toctree} -:glob: -:maxdepth: 1 -:caption: Developer Tools -:hidden: +:::{grid-item-card} **Deploy on Edge Platforms** +:link: edge-platforms-section +:link-type: doc -devtools-overview -bundled-io -etrecord -etdump -runtime-profiling -model-debugging -model-inspector -memory-planning-inspection -delegate-debugging -devtools-tutorial -``` +Deploy on Android, iOS, Laptops / Desktops and embedded platforms with optimized backends. +::: -```{toctree} -:glob: -:maxdepth: 1 -:caption: Runtime -:hidden: +:::{grid-item-card} **Work with LLMs** +:link: llm/working-with-llms +:link-type: doc -runtime-overview -extension-module -extension-tensor -running-a-model-cpp-tutorial -runtime-backend-delegate-implementation-and-linking -runtime-platform-abstraction-layer -portable-cpp-programming -pte-file-format -ptd-file-format -``` +Export, optimize, and deploy Large Language Models on edge devices. +::: -```{toctree} -:glob: -:maxdepth: 1 -:caption: API Reference -:hidden: +:::{grid-item-card} 🔧 **Developer Tools** +:link: tools-section +:link-type: doc -export-to-executorch-api-reference -executorch-runtime-api-reference -runtime-python-api-reference -api-life-cycle -Javadoc -``` +Profile, debug, and inspect your models with comprehensive tooling. 
+::: -```{toctree} -:glob: -:maxdepth: 1 -:caption: Quantization -:hidden: +:::: -quantization-overview -``` +--- -```{toctree} -:glob: -:maxdepth: 1 -:caption: Kernel Library -:hidden: +## Explore Documentation -kernel-library-overview -kernel-library-custom-aten-kernel -kernel-library-selective-build -``` +::::{grid} 1 +:::{grid-item-card} **Intro** +:link: intro-section +:link-type: doc -```{toctree} -:glob: -:maxdepth: 2 -:caption: Working with LLMs -:hidden: +**Overview, architecture, and core concepts** — Understand how ExecuTorch works and its benefits +::: +:::: -Getting Started -Exporting LLMs with export_llm -Exporting custom LLMs -Running with C++ -Running on Android -Running on Android -Running on iOS -``` +::::{grid} 1 +:::{grid-item-card} **Quick Start** +:link: quick-start-section +:link-type: doc -```{toctree} -:glob: -:maxdepth: 1 -:caption: Backend Development -:hidden: +**Get started with ExecuTorch** — Install, export your first model, and run inference +::: +:::: -backend-delegates-integration -backend-delegates-xnnpack-reference -backend-delegates-dependencies -compiler-delegate-and-partitioner -debug-backend-delegate -``` +::::{grid} 1 +:::{grid-item-card} **Edge** +:link: edge-platforms-section +:link-type: doc -```{toctree} -:glob: -:maxdepth: 1 -:caption: IR Specification -:hidden: +**Android, iOS, Desktop, Embedded** — Platform-specific deployment guides and examples +::: +:::: -ir-exir -ir-ops-set-definition -``` +::::{grid} 1 +:::{grid-item-card} **Backends** +:link: backends-section +:link-type: doc -```{toctree} -:glob: -:maxdepth: 1 -:caption: Compiler Entry Points -:hidden: +**CPU, GPU, NPU/Accelerator backends** — Hardware acceleration and backend selection +::: +:::: + +::::{grid} 1 +:::{grid-item-card} **LLMs** +:link: llm/working-with-llms +:link-type: doc + +**LLM export, optimization, and deployment** — Complete LLM workflow for edge devices +::: +:::: + +::::{grid} 1 +:::{grid-item-card} **Advanced** +:link: 
advanced-topics-section +:link-type: doc + +**Quantization, memory planning, custom passes** — Deep customization and optimization +::: +:::: + +::::{grid} 1 +:::{grid-item-card} **Tools** +:link: tools-section +:link-type: doc + +**Developer tools, profiling, debugging** — Comprehensive development and debugging suite +::: +:::: -compiler-backend-dialect -compiler-custom-compiler-passes -compiler-memory-planning -``` +::::{grid} 1 +:::{grid-item-card} **API** +:link: api-section +:link-type: doc + +**API Reference Usages & Examples** — Detailed Python, C++, and Java API references +::: +:::: + +::::{grid} 1 +:::{grid-item-card} **💬 Support** +:link: support-section +:link-type: doc + +**FAQ, troubleshooting, contributing** — Get help and contribute to the project +::: +:::: + +--- + +## What's Supported + +::::{grid} 3 + +:::{grid-item} +**Model Types** + +- Large Language Models (LLMs) +- Computer Vision (CV) +- Speech Recognition (ASR) +- Text-to-Speech (TTS) +- More ... +::: + +:::{grid-item} +**Platforms** + +- Android & iOS +- Linux, macOS, Windows +- Embedded & MCUs +- Go **→ {doc}`edge-platforms-section`** +::: + +:::{grid-item} +**Rich Acceleration** + +- CPU +- GPU +- NPU +- DSP +- Go **→ {doc}`backends-section`** +::: + +:::: ```{toctree} -:glob: -:maxdepth: 1 -:caption: Contributing :hidden: +:maxdepth: 1 -contributing -``` +intro-section +quick-start-section +edge-platforms-section +backends-section +llm/working-with-llms +advanced-topics-section +tools-section +api-section +support-section diff --git a/docs/source/intro-how-it-works.md b/docs/source/intro-how-it-works.md index 3e6d384a62f..3ced602fed4 100644 --- a/docs/source/intro-how-it-works.md +++ b/docs/source/intro-how-it-works.md @@ -6,7 +6,7 @@ At a high-level, there are three steps for running a PyTorch model with ExecuTor 1. 
**Export the model.** The first step is to capture the PyTorch program as a graph, which is a new representation of the model that can be expressed in terms of a series of operators such as addition, multiplication, or convolution. This process safely preserves the semantics of the original PyTorch program. This representation is the first step to enable running the model on edge use cases that have low memory and/or low compute. 1. **Compile the exported model to an ExecuTorch program.** Given an exported model from step 1, convert it to an executable format called an ExecuTorch program that the runtime can use for inference. This step provides entry points for various optimizations such as compressing the model (e.g., quantization) to reduce size and further compiling subgraphs down to on-device specialized hardware accelerators to improve latency. It also provides an entry point for memory planning, i.e. to efficiently plan the location of intermediate tensors to reduce the runtime memory footprint. -1. **Run the ExecuTorch program on a target device.** Given an input--such as an image represented as an input activation tensor--the ExecuTorch runtime loads the ExecuTorch program, executes the instructions represented by the program, and computes an output. This step is efficient because (1) the runtime is lightweight and (2) an efficient execution plan has already been calculated in steps 1 and 2, making it possible to do performant inference. Furthermore, portability of the core runtime enabled performant execution even on highly-constrained devices. +1. **Run the ExecuTorch program on a target device.** Given an input--such as an image represented as an input activation tensor--the ExecuTorch runtime loads the ExecuTorch program, executes the instructions represented by the program, and computes an output. 
This step is efficient because (1) the runtime is lightweight and (2) an efficient execution plan has already been calculated in steps 1 and 2, making it possible to do performant inference. Furthermore, portability of the core runtime enables performant execution even on highly-constrained devices. This figure illustrates the three-step process of exporting a PyTorch program, compiling it into an ExecuTorch program that targets a specific hardware device, and finally executing the program on the device using the ExecuTorch runtime. ![name](_static/img/how-executorch-works-high-level.png) diff --git a/docs/source/intro-overview.md b/docs/source/intro-overview.md index 96c7982b8fe..be2fd468716 100644 --- a/docs/source/intro-overview.md +++ b/docs/source/intro-overview.md @@ -20,7 +20,7 @@ Key value propositions of ExecuTorch are: ## Why ExecuTorch? Supporting on-device AI presents unique challenges with diverse hardware, -critical power requirements, low/no internet connectivity, and realtime +critical power requirements, low/no internet connectivity, and real-time processing needs. These constraints have historically prevented or slowed down the creation of scalable and performant on-device AI solutions. We designed ExecuTorch, backed by our industry partners like Meta, Arm, Apple, and Qualcomm, diff --git a/docs/source/intro-section.md b/docs/source/intro-section.md new file mode 100644 index 00000000000..2f6f3c57c88 --- /dev/null +++ b/docs/source/intro-section.md @@ -0,0 +1,27 @@ +(intro-section)= + +# Intro + +Overview, architecture, and core concepts of ExecuTorch. + +ExecuTorch is PyTorch's solution for efficient AI inference on edge devices, providing portability, productivity, and performance for edge computing platforms. + +## Getting Started with ExecuTorch + +New to ExecuTorch? 
Start with these foundational topics: + +- **{doc}`intro-overview`** - High-level overview of ExecuTorch capabilities +- **{doc}`intro-how-it-works`** - Technical overview of the ExecuTorch workflow +- **{doc}`getting-started-architecture`** - System architecture and components +- **{doc}`concepts`** - Core concepts and terminology + +```{toctree} +:hidden: +:maxdepth: 2 +:caption: Introduction Topics + +intro-overview +intro-how-it-works +getting-started-architecture +concepts +``` diff --git a/docs/source/ios-backends.md b/docs/source/ios-backends.md new file mode 100644 index 00000000000..cb186f53319 --- /dev/null +++ b/docs/source/ios-backends.md @@ -0,0 +1,19 @@ +(ios-backends)= +# Backends + +Available hardware acceleration backends for iOS deployment. + +## Apple Hardware Acceleration (Recommended) + +- {doc}`ios-coreml` — CoreML (NPU/GPU, recommended for iOS) +- {doc}`ios-mps` — Metal Performance Shaders (GPU) + +## CPU Acceleration + +- {doc}`ios-xnnpack` — XNNPACK (CPU acceleration) + +```{toctree} +:hidden: +ios-coreml +ios-mps +ios-xnnpack diff --git a/docs/source/ios-coreml.md b/docs/source/ios-coreml.md new file mode 100644 index 00000000000..48271326d87 --- /dev/null +++ b/docs/source/ios-coreml.md @@ -0,0 +1 @@ +```{include} backends-coreml.md diff --git a/docs/source/ios-examples.md b/docs/source/ios-examples.md new file mode 100644 index 00000000000..86acf3273a6 --- /dev/null +++ b/docs/source/ios-examples.md @@ -0,0 +1,4 @@ +# Examples & Demos + +- [iOS LLM Examples Repository](https://github.com/meta-pytorch/executorch-examples/tree/main/llm/apple) +- [MobileViT Demo App](https://github.com/meta-pytorch/executorch-examples/tree/main/mv3/apple/ExecuTorchDemo) diff --git a/docs/source/ios-mps.md b/docs/source/ios-mps.md new file mode 100644 index 00000000000..d6f305d33aa --- /dev/null +++ b/docs/source/ios-mps.md @@ -0,0 +1 @@ +```{include} backends-mps.md diff --git a/docs/source/ios-section.md b/docs/source/ios-section.md new file mode 100644 
index 00000000000..33c9a61ce1d --- /dev/null +++ b/docs/source/ios-section.md @@ -0,0 +1,23 @@ +(ios-section)= +# iOS + +Deploy ExecuTorch on iOS devices with Apple hardware acceleration. + +## Quick Start & Integration + +- {doc}`using-executorch-ios` — Complete iOS integration guide + +## Backends + +- {doc}`ios-backends` — Available iOS backends and acceleration options + +## Examples & Demos + +- {doc}`ios-examples` — Explore iOS Examples & Demos + + +```{toctree} +:hidden: +using-executorch-ios +ios-backends +ios-examples diff --git a/docs/source/ios-xnnpack.md b/docs/source/ios-xnnpack.md new file mode 100644 index 00000000000..315dd747006 --- /dev/null +++ b/docs/source/ios-xnnpack.md @@ -0,0 +1 @@ +```{include} backends-xnnpack.md diff --git a/docs/source/ir-specification.md b/docs/source/ir-specification.md new file mode 100644 index 00000000000..c58098ffc67 --- /dev/null +++ b/docs/source/ir-specification.md @@ -0,0 +1,8 @@ +# IR Specification + +```{toctree} +:maxdepth: 1 + +ir-exir +ir-ops-set-definition +``` diff --git a/docs/source/kernel-library-advanced.md b/docs/source/kernel-library-advanced.md new file mode 100644 index 00000000000..5f0215b87c1 --- /dev/null +++ b/docs/source/kernel-library-advanced.md @@ -0,0 +1,23 @@ +(kernel-library-advanced)= + +# Kernel Library Deep Dive + +Advanced kernel implementation and customization for ExecuTorch. 
+ +## Kernel Library Overview + +- {doc}`kernel-library-overview` — Architecture and design of the kernel library + +- {doc}`kernel-library-custom-aten-kernel` — Kernel registration and customization + +## Build Optimization + +- {doc}`kernel-library-selective-build` — Selective build for reduced binary footprint + +```{toctree} +:hidden: +:maxdepth: 1 + +kernel-library-overview +kernel-library-custom-aten-kernel +kernel-library-selective-build diff --git a/docs/source/kernel-library-overview.md b/docs/source/kernel-library-overview.md index cfd46524097..a826b334ba4 100644 --- a/docs/source/kernel-library-overview.md +++ b/docs/source/kernel-library-overview.md @@ -1,7 +1,7 @@ -This page provides a description of the Portable Kernel Library and the Optimized Kernel Library, which are the default kernel libraries shipped with ExecuTorch. It is recommended reading for those who are interested in executing ExecuTorch programs with these kernel libraries, or for those who want to implement their own kernels and kernel libraries. - # Overview of ExecuTorch’s Kernel Libraries +This page provides a description of the Portable Kernel Library and the Optimized Kernel Library, which are the default kernel libraries shipped with ExecuTorch. It is recommended reading for those who are interested in executing ExecuTorch programs with these kernel libraries, or for those who want to implement their own kernels and kernel libraries. + An ExecuTorch program encodes instructions that describe the computation that should be performed by the program. Many of these instructions will correspond to calling a specific ATen operator, for example `aten.convolution`. However, one of the core design principles of ExecuTorch is that the signature of an operator should be separate from the implementation of the operator. 
This means that the ExecuTorch runtime does not ship with any standard implementation for ATen operators; users must make sure to link against kernel libraries that contain implementations of the operators required by their ExecuTorch program, and configure [operator registration](kernel-library-custom-aten-kernel.md) to map an operator signature to the desired implementation. This makes it easy to adjust the implementation of operators such as `aten.convolution` that will be called when executing an ExecuTorch program; it allows users to select the exact operator implementations that will meet the unique performance, memory usage, battery usage, etc. constraints of their use-case. **In essence, a kernel library is simply a collection of ATen operator implementations that follow a common theme or design principle**. Note that due to ExecuTorch’s selective build process (discussed in the following section), operator implementations are linked individually. This means that users can easily mix different kernel libraries in their build without sacrificing build size. diff --git a/docs/source/kernel-library-selective-build.md b/docs/source/kernel-library-selective-build.md index 7d6495656a2..666206acb94 100644 --- a/docs/source/kernel-library-selective-build.md +++ b/docs/source/kernel-library-selective-build.md @@ -65,7 +65,7 @@ gen_selected_ops( ) ``` -The macro makes a call to gen_oplist.py, which requires a [distinct selection](https://github.com/BujSet/executorch/blob/main/codegen/tools/gen_oplist.py#L222-L228) of API choice. `OPS_SCHEMA_YAML`, `ROOT_OPS`, `INCLUDE_ALL_OPS`, and `OPS_FROM_MODEL` are mutually exclusive options, and should not be used in conjunction. +The macro makes a call to gen_oplist.py, which requires a [distinct selection](https://github.com/pytorch/executorch/blob/main/codegen/tools/gen_oplist.py#L222-L228) of API choice. 
`OPS_SCHEMA_YAML`, `ROOT_OPS`, `INCLUDE_ALL_OPS`, and `OPS_FROM_MODEL` are mutually exclusive options, and should not be used in conjunction. ### Select all ops @@ -83,7 +83,7 @@ This API lets users pass in a list of operator names. Note that this API can be ### Select ops from model -This API lets users pass in a pte file of an exported model. When used, the pte file will be parsed to generate a yaml file that enumerates the operators and dtypes used in the model. +This API lets users pass in a pte file of an exported model. When used, the pte file will be parsed to generate a yaml file that enumerates the operators and dtypes used in the model. ### Dtype Selective Build @@ -91,7 +91,7 @@ Beyond pruning the binary to remove unused operators, the binary size can furthe ## Example Walkthrough -In [examples/selective_build/CMakeLists.txt](https://github.com/BujSet/executorch/blob/main/examples/selective_build/CMakeLists.txt#L48-L72), we have the following cmake config options: +In [examples/selective_build/CMakeLists.txt](https://github.com/pytorch/executorch/blob/main/examples/selective_build/advanced/CMakeLists.txt), we have the following cmake config options: 1. `EXECUTORCH_SELECT_OPS_YAML` 2. `EXECUTORCH_SELECT_OPS_LIST` @@ -99,10 +99,10 @@ In [examples/selective_build/CMakeLists.txt](https://github.com/BujSet/executorc 4. `EXECUTORCH_SELECT_OPS_FROM_MODEL` 5. `EXECUTORCH_DTYPE_SELECTIVE_BUILD` -These options allow a user to tailor the cmake build process to utilize the different APIs, and results in different invocations on the `gen_selected_ops` [function](https://github.com/BujSet/executorch/blob/main/examples/selective_build/CMakeLists.txt#L110-L123). 
The following table describes some examples of how the invocation changes when these configs are set: +These options allow a user to tailor the cmake build process to utilize the different APIs, and results in different invocations on the `gen_selected_ops` [function](https://github.com/pytorch/executorch/blob/main/examples/selective_build/advanced/CMakeLists.txt). The following table describes some examples of how the invocation changes when these configs are set: | Example cmake Call | Resultant `gen_selected_ops` Invocation | -| :----: | :---:| +| :----: | :---:| |
cmake -D… -DEXECUTORCH_SELECT_OPS_LIST="aten::add.out,aten::mm.out"
|
gen_selected_ops("" "${SELECT_OPS_LIST}" "" "" "")
| |
cmake -D… -DEXECUTORCH_SELECT_OPS_YAML=ON
|
set(_custom_ops_yaml ${EXECUTORCH_ROOT}/examples/portable/custom_ops/custom_ops.yaml)
gen_selected_ops("${_custom_ops_yaml}" "" "" "" "")
| |
cmake -D… -DEXECUTORCH_SELECT_OPS_FROM_MODEL="model.pte.out"
|
gen_selected_ops("" "" "" "${_model_path}" "")
| diff --git a/docs/source/kernel-library.md b/docs/source/kernel-library.md new file mode 100644 index 00000000000..a995a20973b --- /dev/null +++ b/docs/source/kernel-library.md @@ -0,0 +1,9 @@ +# Kernel Library + +```{toctree} +:maxdepth: 1 + +kernel-library-overview +kernel-library-custom-aten-kernel +kernel-library-selective-build +``` diff --git a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md index 4587589a51b..ae1b4f15c99 100644 --- a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md +++ b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md @@ -1,6 +1,7 @@ -# Building and Running Llama 3 8B Instruct with Qualcomm AI Engine Direct Backend +# Run Llama 3 3B Instruct on Android (with Qualcomm AI Engine Direct Backend) -This tutorial demonstrates how to export Llama 3 8B Instruct for Qualcomm AI Engine Direct Backend and running the model on a Qualcomm device. +This tutorial demonstrates how to export and run the Llama 3 3B Instruct model on a Qualcomm device using the Qualcomm AI Engine Direct Backend via ExecuTorch. +We use a static Llama [implementation](https://github.com/pytorch/executorch/blob/main/examples/qualcomm/oss_scripts/llama/model/static_llama.py) to optimize performance and memory usage during on-device inference. ## Prerequisites @@ -13,10 +14,8 @@ This tutorial demonstrates how to export Llama 3 8B Instruct for Qualcomm AI Eng ## Instructions -### Step 1: Prepare the checkpoint of the model and optimized matrix from [Spin Quant](https://github.com/facebookresearch/SpinQuant) - -1. For Llama 3 tokenizer and checkpoint, please refer to https://github.com/meta-llama/llama-models/blob/main/README.md for further instructions on how to download `tokenizer.model`, `consolidated.00.pth` and `params.json`. -2. To get the optimized matrix, please refer to [SpinQuant on GitHub](https://github.com/facebookresearch/SpinQuant). 
You can download the optimized rotation matrices in the Quantized Models section. Please choose **LLaMA-3-8B/8B_W4A16KV16_lr_1.5_seed_0**. +### Step 1: Prepare the checkpoint and tokenizer of the model. +1. For Llama 3 tokenizer and checkpoint, please refer to [instructions](https://www.llama.com/models/llama-3) for further instructions on how to download `tokenizer.model`, `consolidated.00.pth` and `params.json`. ### Step 2: Export to ExecuTorch with Qualcomm AI Engine Direct Backend Deploying large language models like Llama 3 on-device presents the following challenges: @@ -25,122 +24,79 @@ Deploying large language models like Llama 3 on-device presents the following ch 2. High model loading and inference time. 3. Difficulty in quantization. -To address these challenges, we have implemented the following solutions: -1. Using `quantization.pt2e_quantize = "qnn_16a4w'` to quantize activations and weights, thereby reducing the on-disk model size and alleviating memory pressure during inference. -2. Using `backed.qnn.num_sharding = 8` to shard the model into sub-parts. -3. Performing graph transformations to convert or decompose operations into more accelerator-friendly operations. -4. Using `backend.qnn.optimized_rotation_path = ""` to apply R1 and R2 of [Spin Quant](https://github.com/facebookresearch/SpinQuant) to improve accuracy. -5. Using `quantization.calibration_data = "<|start_header_id|>system<|end_header_id|..."` to ensure that during quantization, the calibration includes special tokens in the prompt template. For more details on the prompt template, refer to [the model card](https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/). +To address these, we apply the following optimizations: + +1. Quantization: Use `QuantDtype.use_16a4w_block` for post-training quantization to reduce model size and memory usage. + +2. Mixed Precision Quantization: compresses KV cache tensors to 8-bit and applies `QuantDtype.use_16a8w` to the LM head. + +3. 
Model Sharding: Set `num_sharding` = 4 to shard the model into sub-parts. This helps reduce memory pressure and improve performance during on-device inference. The number of shards might be different depending on the model size. + +4. Graph Transformations: Convert operations into accelerator-friendly formats for better runtime performance. + +You can find the full optimization configuration in this [file](https://github.com/pytorch/executorch/blob/main/examples/qualcomm/oss_scripts/llama/__init__.py), as shown below: + +``` python +@register_llm_model("llama3_2-3b_instruct") +@dataclass(init=False, frozen=True) +class Llama3_2_3B_Instruct(LLMModelConfig): + repo_id = None + params_path = None + convert_weights = None + transform_weight = True + # The Llama3_2 enabled should be instruct, however, Llama's tokenizer does not provide utility to apply chat template. + instruct_model = False + + num_sharding = 4 + # quant config + ptq = QuantDtype.use_16a4w_block + group_size = 32 # Group size used in block quantization for weight quantization. Will only be used when ptq = 16a4w_block + masked_softmax = False + + # SeqMSE Quantization: optimizes the parameter encodings of each layer of a model individually to minimize the difference between the layer’s original and quantized outputs. (Implementation details: ./backends/qualcomm/_passes/seq_mse.py) In this configuration, we set `seq_mse_candidates` = 0, which means SeqMSE quantization is not applied. + seq_mse_candidates = 0 + r1 = False + r2 = False + r3 = False + custom_annotation = ( + annotate_kv_8bit, + annotate_output_16a8w, + ) +``` + To export with the Qualcomm AI Engine Direct Backend, ensure the following: -1. The host machine has more than 100GB of memory (RAM + swap space). +1. The host machine has more than 64GB of memory (RAM + swap space). 2. The entire process takes a few hours. 
```bash -# path/to/config.yaml -base: - model_class: llama3 - checkpoint: path/to/consolidated.00.pth - params: path/to/params.json - tokenizer_path: path/to/tokenizer.model - metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' -model: - use_kv_cache: True - enable_dynamic_shape: False -quantization: - pt2e_quantize: qnn_16a4w - # Please note that calibration_data must include the prompt template for special tokens. - calibration_data: "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" -backend: - qnn: - enabled: True - num_sharding: 8 - - -# export_llm -python -m extension.llm.export.export_llm \ - --config path/to/config.yaml +# export llama +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --decoder_model llama3_2-3b_instruct --model_mode kv --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" --tasks wikitext --limit 1 --compile_only ``` +Note: end-to-end [instructions](https://github.com/pytorch/executorch/blob/main/examples/qualcomm/oss_scripts/llama/README.md) ### Step 3: Invoke the Runtime on an Android smartphone with Qualcomm SoCs -1. 
Build executorch with Qualcomm AI Engine Direct Backend for android - ```bash - cmake \ - -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake" \ - -DANDROID_ABI=arm64-v8a \ - -DCMAKE_INSTALL_PREFIX=cmake-android-out \ - -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_BUILD_QNN=ON \ - -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_LLM=ON \ - -Bcmake-android-out . - - cmake --build cmake-android-out -j16 --target install --config Release - ``` -2. Build llama runner for android -```bash - cmake \ - -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK_ROOT}"/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI=arm64-v8a \ - -DCMAKE_INSTALL_PREFIX=cmake-android-out \ - -DCMAKE_BUILD_TYPE=Release -DPYTHON_EXECUTABLE=python \ - -DEXECUTORCH_BUILD_QNN=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_LLM=ON \ - -Bcmake-android-out/examples/models/llama examples/models/llama - - cmake --build cmake-android-out/examples/models/llama -j16 --config Release -``` -3. Run on Android via adb shell -*Pre-requisite*: Make sure you enable USB debugging via developer options on your phone - **3.1 Connect your android phone** -**3.2 We need to push required QNN libraries to the device.** -```bash -# make sure you have write-permission on below path. 
-DEVICE_DIR=/data/local/tmp/llama -adb shell mkdir -p ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtp.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnSystem.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV69Stub.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV73Stub.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/aarch64-android/libQnnHtpV75Stub.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so ${DEVICE_DIR} -adb push ${QNN_SDK_ROOT}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${DEVICE_DIR} -``` - -**3.3 Upload model, tokenizer and llama runner binary to phone** -```bash -adb push ${DEVICE_DIR} -adb push ${DEVICE_DIR} -adb push cmake-android-out/lib/libqnn_executorch_backend.so ${DEVICE_DIR} -adb push cmake-out-android/examples/models/llama/llama_main ${DEVICE_DIR} -``` +**3.2 Make sure the following artifact is present before running the model.** +-- artifact/ + └── llama_qnn.pte -**3.4 Run model** +**3.3 Run model** ```bash -adb shell "cd ${DEVICE_DIR} && ./llama_main --model_path --tokenizer_path --prompt \"<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n\" --seq_len 128" -``` -You should see the message: -``` -<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello! I'd be delighted to chat with you about Facebook. Facebook is a social media platform that was created in 2004 by Mark Zuckerberg and his colleagues while he was a student at Harvard University. 
It was initially called "Facemaker" but later changed to Facebook, which is a combination of the words "face" and "book". The platform was initially intended for people to share their thoughts and share information with their friends, but it quickly grew to become one of the +# Run llama +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --decoder_model llama3_2-3b_instruct --model_mode kv --max_seq_len 1024 --prompt "I would like to learn python, could you teach me with a simple example?" --tasks wikitext --limit 1 --pre_gen_pte ${PATH_TO_ARTIFACT} ``` ## What is coming? - Performance improvements - Reduce the memory pressure during inference to support 12GB Qualcomm devices -- Support more LLMs (Qwen, Phi-4-mini, etc.) +- Broader LLM Support via [Optimum ExecuTorch](https://github.com/huggingface/optimum-executorch?tab=readme-ov-file#llms-large-language-models) + + - Already supported models (e.g.): Llama2, Llama3, Gemma, Qwen, Phi-4, SmolLM. For usage examples, please refer to [README](https://github.com/pytorch/executorch/blob/main/examples/qualcomm/oss_scripts/llama/README.md) ## FAQ If you encounter any issues while reproducing the tutorial, please file a github -issue on ExecuTorch repo and tag use `#qcom_aisw` tag +[issue](https://github.com/pytorch/executorch/issues) on ExecuTorch repo and tag use `#qcom_aisw` tag \ No newline at end of file diff --git a/docs/source/llm/export-custom-llm.md b/docs/source/llm/export-custom-llm.md index 57537ba31d8..4797f773fa3 100644 --- a/docs/source/llm/export-custom-llm.md +++ b/docs/source/llm/export-custom-llm.md @@ -81,7 +81,7 @@ with open("nanogpt.pte", "wb") as file: To export, run the script with `python export_nanogpt.py` (or python3, as appropriate for your environment). It will generate a `nanogpt.pte` file in the current directory. 
-For more information, see [Exporting to ExecuTorch](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial) and +For more information, see [Exporting to ExecuTorch](../tutorials/export-to-executorch-tutorial) and [torch.export](https://pytorch.org/docs/stable/export.html). ## Backend delegation @@ -143,7 +143,7 @@ example_inputs = ( # long as they adhere to the rules specified in the dynamic shape configuration. # Here we set the range of 0th model input's 1st dimension as # [0, model.config.block_size]. -# See https://pytorch.org/executorch/main/concepts.html#dynamic-shapes +# See ../concepts.html#dynamic-shapes # for details about creating dynamic shapes. dynamic_shape = ( {1: torch.export.Dim("token_dim", max=model.config.block_size - 1)}, diff --git a/docs/source/llm/export-llm.md b/docs/source/llm/export-llm.md index 462d9a51849..082b8c2b18d 100644 --- a/docs/source/llm/export-llm.md +++ b/docs/source/llm/export-llm.md @@ -4,7 +4,7 @@ Instead of needing to manually write code to call torch.export(), use ExecuTorch ## Prerequisites -The LLM export functionality requires the `pytorch_tokenizers` package. If you encounter a `ModuleNotFoundError: No module named 'pytorch_tokenizers'` error, install it from the ExecutorTorch source code: +The LLM export functionality requires the `pytorch_tokenizers` package. If you encounter a `ModuleNotFoundError: No module named 'pytorch_tokenizers'` error, install it from the ExecuTorch source code: ```bash pip install -e ./extension/llm/tokenizers/ @@ -78,7 +78,7 @@ python -m extension.llm.export.export_llm \ - `use_shared_embedding` can help for models with tied input/output embedding layers, given that you quantize using TorchAO low bit ops (`quantization.qmode: torchao:8da(\\d+)w` or `quantization.qmode: torchao:fpa(\d+)w`), see more [here](https://github.com/pytorch/executorch/blob/main/extension/llm/export/config/llm_config.py#L307). 
- `use_attention_sink` to extend generation by removing from the beginning of the KV cache when the max context length is reached. - `quantize_kv_cache` quantizes the KV cache in int8. -- `local_global_attention` impements [Local-Global Attention](https://arxiv.org/abs/2411.09604), making specific attention layers use a much smaller localized sliding window KV cache. +- `local_global_attention` implements [Local-Global Attention](https://arxiv.org/abs/2411.09604), making specific attention layers use a much smaller localized sliding window KV cache. ## Quantization Quantization options are defined by [`QuantizationConfig`](https://github.com/pytorch/executorch/blob/main/extension/llm/export/config/llm_config.py#L283). ExecuTorch does quantization in two ways: @@ -92,7 +92,7 @@ The quantization modes are defined [here](https://github.com/pytorch/executorch/ Common ones to use are: - `8da4w`: short for int8 dynamic activation + int4 weight quantization. -- `int8`: int8 weight-only quanziation. +- `int8`: int8 weight-only quantization. Group size is specified with: - `group_size`: 8, 32, 64, etc. 
diff --git a/docs/source/llm/getting-started.md b/docs/source/llm/getting-started.md index 849418342b6..6b6f9d96df7 100644 --- a/docs/source/llm/getting-started.md +++ b/docs/source/llm/getting-started.md @@ -21,6 +21,6 @@ Deploying LLMs to ExecuTorch can be boiled down to a two-step process: (1) expor - [Exporting LLMs](export-llm.md) - [Exporting custom LLMs](export-custom-llm.md) - [Running with C++](run-with-c-plus-plus.md) -- [Running on Android (XNNPack)](llama-demo-android.md) +- [Running on Android (XNNPack)](https://github.com/meta-pytorch/executorch-examples/tree/main/llm/android) - [Running on Android (Qualcomm)](build-run-llama3-qualcomm-ai-engine-direct-backend.md) - [Running on iOS](https://github.com/meta-pytorch/executorch-examples/tree/main/llm/apple) diff --git a/docs/source/llm/llama-demo-android.md b/docs/source/llm/llama-demo-android.md deleted file mode 100644 index 023f82baf33..00000000000 --- a/docs/source/llm/llama-demo-android.md +++ /dev/null @@ -1,2 +0,0 @@ -```{include} ../../../examples/demo-apps/android/LlamaDemo/README.md -``` diff --git a/docs/source/llm/run-with-c-plus-plus.md b/docs/source/llm/run-with-c-plus-plus.md index f987fcab2a5..217afad847b 100644 --- a/docs/source/llm/run-with-c-plus-plus.md +++ b/docs/source/llm/run-with-c-plus-plus.md @@ -10,7 +10,7 @@ Before you begin, make sure you have: - Please also see [Model Metadata](#model-metadata) section for important metadata to be serialized into `.pte`. 2. A tokenizer file compatible with your model - For HuggingFace tokenizers, this is a JSON file `tokenizer.json` - - For SentencePiece tokenizers, this is is a `tokenizer.model` file and normally live alongside the weights file + - For SentencePiece tokenizers, this is a `tokenizer.model` file and normally lives alongside the weights file 3. 
CMake and a C++ compiler installed - CMake version 3.29 or higher - g++ or clang compiler diff --git a/docs/source/llm/working-with-llms.md b/docs/source/llm/working-with-llms.md new file mode 100644 index 00000000000..4c238f7ae5c --- /dev/null +++ b/docs/source/llm/working-with-llms.md @@ -0,0 +1,18 @@ +(working-with-llms)= + +# LLMs + +Learn how to export LLM models and deploy them across different platforms and runtime environments. This section covers the complete workflow from model export to running inference on mobile devices and edge hardware. + + +```{toctree} +:maxdepth: 1 +:caption: Working with LLMs + +getting-started +export-llm +export-custom-llm +run-with-c-plus-plus +build-run-llama3-qualcomm-ai-engine-direct-backend +run-on-ios +``` diff --git a/docs/source/platforms-desktop.md b/docs/source/platforms-desktop.md new file mode 100644 index 00000000000..acbdb06a6b6 --- /dev/null +++ b/docs/source/platforms-desktop.md @@ -0,0 +1,23 @@ +# Desktop & Laptop + +ExecuTorch supports desktop and laptop deployment across Linux, macOS, and Windows. + +## Platform-Specific Guides +- [C++ Runtime Integration](using-executorch-cpp) - Complete setup guide +- [Building from Source](using-executorch-building-from-source) + +## Available Backends by Platform + +### Linux +- [XNNPACK (CPU)](backends-xnnpack) +- [OpenVINO (Intel)](build-run-openvino) +- [ARM Ethos-U (ARM64)](backends-arm-ethos-u) + +### macOS +- [CoreML (recommended)](backends-coreml) +- [MPS (Apple Silicon)](backends-mps) +- [XNNPACK (CPU)](backends-xnnpack) + +### Windows +- [XNNPACK (CPU)](backends-xnnpack) +- [OpenVINO (Intel)](build-run-openvino) diff --git a/docs/source/platforms-embedded.md b/docs/source/platforms-embedded.md new file mode 100644 index 00000000000..5ea248fc0d9 --- /dev/null +++ b/docs/source/platforms-embedded.md @@ -0,0 +1,19 @@ +# Embedded Platforms + +ExecuTorch supports embedded devices from microcontrollers to edge devices. 
+ +## Platform-Specific Guides +- [C++ Runtime Integration](using-executorch-cpp) - Complete setup guide +- [Building from Source](using-executorch-building-from-source) + +## Available Backends by Device Type + +### Microcontrollers +- [Cadence Xtensa Backend](backends-cadence) +- [ARM Ethos-U NPU Backend](backends-arm-ethos-u) +- [Custom Backend Development](backend-delegates-integration) + +### Edge Devices +- [ARM Ethos-U NPU Backend](backends-arm-ethos-u) +- [NXP eIQ Neutron Backend](backend-nxp) +- [Custom Hardware Integration](backend-delegates-integration) diff --git a/docs/source/ptd-file-format.md b/docs/source/ptd-file-format.md index 6381e8a071c..c7bad1f34c0 100644 --- a/docs/source/ptd-file-format.md +++ b/docs/source/ptd-file-format.md @@ -111,7 +111,7 @@ The flatbuffer-encoded metadata follows the headers and contains: ### Tensor Layout If a data segment contains a canonical tensor, it may have associated layout information: -- **Scalar type**: Data type (float32, int32, etc.) using ExecutorTorch scalar types. +- **Scalar type**: Data type (float32, int32, etc.) using ExecuTorch scalar types. - **Sizes**: Dimensions of the tensor. - **Dim order**: Memory layout order specifying how dimensions are arranged in memory. diff --git a/docs/source/quantization-optimization.md b/docs/source/quantization-optimization.md new file mode 100644 index 00000000000..d2005b3adac --- /dev/null +++ b/docs/source/quantization-optimization.md @@ -0,0 +1,20 @@ +(quantization-optimization)= + +# Quantization & Optimization + +Advanced techniques for model compression and performance optimization. 
+ +## Quantization Strategies + +- {doc}`quantization-overview` — Comprehensive quantization strategies and techniques + +## Performance Optimization + +- {doc}`runtime-profiling` — Performance profiling and optimization techniques + +```{toctree} +:hidden: +:maxdepth: 1 + +quantization-overview +runtime-profiling diff --git a/docs/source/quantization-overview.md b/docs/source/quantization-overview.md index fdceee80e8e..4ff8d34a4a8 100644 --- a/docs/source/quantization-overview.md +++ b/docs/source/quantization-overview.md @@ -14,7 +14,7 @@ Quantization in ExecuTorch is backend-specific. Each backend defines how models The PT2E quantization workflow has three main steps: 1. Configure a backend-specific quantizer. -2. Prepare, calibrate, convert, and evalute the quantized model in PyTorch +2. Prepare, calibrate, convert, and evaluate the quantized model in PyTorch 3. Lower the model to the target backend ## 1. Configure a Backend-Specific Quantizer diff --git a/docs/source/quantization.md b/docs/source/quantization.md new file mode 100644 index 00000000000..b5ee9f21897 --- /dev/null +++ b/docs/source/quantization.md @@ -0,0 +1,7 @@ +# Quantization + +```{toctree} +:maxdepth: 1 + +quantization-overview +``` diff --git a/docs/source/quick-start-section.md b/docs/source/quick-start-section.md new file mode 100644 index 00000000000..b35bed8d22c --- /dev/null +++ b/docs/source/quick-start-section.md @@ -0,0 +1,38 @@ +(quick-start-section)= +# Quick Start + +Get started with ExecuTorch in just a few steps. + +This section walks you through the essential steps to get ExecuTorch up and running, from initial setup to exporting your first model for edge deployment. + +## What You'll Learn + +Follow these guides in order to get started with ExecuTorch: + +- **{doc}`getting-started`** - Initial Setup: Set up your development environment and run your first ExecuTorch example. + +- **{doc}`using-executorch-export`** - Exporting your model: Export for Edge deployment. 
+ +- **{doc}`using-executorch-building-from-source`** - Building from Source: Build ExecuTorch from source for custom configurations and development. + +## Prerequisites + +- Python 3.10-3.12 +- PyTorch 2.9+ +- Basic familiarity with PyTorch model development + +## Next Steps + +After completing the quick start, explore: + +- **{doc}`edge-platforms-section`** - Deploy to specific platforms (Android, iOS, Desktop, Embedded) +- **{doc}`backends-section`** - Choose the right acceleration backend for your hardware + +```{toctree} +:hidden: +:maxdepth: 2 +:caption: Quick Start Guide + +getting-started +using-executorch-export +using-executorch-building-from-source diff --git a/docs/source/running-a-model-cpp-tutorial.md b/docs/source/running-a-model-cpp-tutorial.md index a12ef122bc8..5ae4235995d 100644 --- a/docs/source/running-a-model-cpp-tutorial.md +++ b/docs/source/running-a-model-cpp-tutorial.md @@ -6,13 +6,13 @@ In this tutorial, we will cover how to run an ExecuTorch model in C++ using the For a high level overview of the ExecuTorch Runtime please see [Runtime Overview](runtime-overview.md), and for more in-depth documentation on each API please see the [Runtime API Reference](executorch-runtime-api-reference.rst). -[Here](https://github.com/pytorch/executorch/blob/main/examples/portable/executor_runner/executor_runner.cpp) is a fully functional version C++ model runner, and the [Setting up ExecuTorch](getting-started-setup.md) doc shows how to build and run it. +[Here](https://github.com/pytorch/executorch/blob/main/examples/portable/executor_runner/executor_runner.cpp) is a fully functional version C++ model runner, and the [Setting up ExecuTorch](getting-started-setup.rst) doc shows how to build and run it. ## Prerequisites You will need an ExecuTorch model to follow along. We will be using -the model `SimpleConv` generated from the [Exporting to ExecuTorch tutorial](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial). 
+the model `SimpleConv` generated from the [Exporting to ExecuTorch tutorial](tutorials/export-to-executorch-tutorial) . ## Model Loading @@ -96,7 +96,7 @@ MemoryManager memory_manager(&method_allocator, &planned_memory); ## Loading a Method -In ExecuTorch we load and initialize from the `Program` at a method granularity. Many programs will only have one method 'forward'. `load_method` is where initialization is done, from setting up tensor metadata, to intializing delegates, etc. +In ExecuTorch we load and initialize from the `Program` at a method granularity. Many programs will only have one method 'forward'. `load_method` is where initialization is done, from setting up tensor metadata, to initializing delegates, etc. ``` cpp Result method = program->load_method(method_name); diff --git a/docs/source/runtime-integration-advanced.md b/docs/source/runtime-integration-advanced.md new file mode 100644 index 00000000000..a76265c4093 --- /dev/null +++ b/docs/source/runtime-integration-advanced.md @@ -0,0 +1,20 @@ +(runtime-integration-advanced)= + +# Runtime & Integration + +Advanced runtime integration topics + +## Platform Integration + +- {doc}`runtime-platform-abstraction-layer` — Platform abstraction layer for cross-platform deployment + +## Portable C++ Programming + +- {doc}`portable-cpp-programming` — Portable C++ programming for cross-platform deployment + +```{toctree} +:hidden: +:maxdepth: 1 + +runtime-platform-abstraction-layer +portable-cpp-programming diff --git a/docs/source/runtime-overview.md b/docs/source/runtime-overview.md index 96a618a2a41..1df3da40478 100644 --- a/docs/source/runtime-overview.md +++ b/docs/source/runtime-overview.md @@ -11,7 +11,7 @@ Works](intro-how-it-works.md). 
At the highest level, the ExecuTorch runtime is responsible for: * Loading binary `.pte` program files that were generated by the - [`to_executorch()`](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial) step of the + [`to_executorch()`](tutorials/export-to-executorch-tutorial) step of the model-lowering process. * Executing the series of instructions that implement a lowered model. diff --git a/docs/source/runtime-profiling.md b/docs/source/runtime-profiling.md index 120d31954fd..56b62de599d 100644 --- a/docs/source/runtime-profiling.md +++ b/docs/source/runtime-profiling.md @@ -20,4 +20,4 @@ We provide access to all the profiling data via the Python [Inspector API](model - Through the Inspector API, users can do a wide range of analysis varying from printing out performance details to doing more finer granular calculation on module level. -Please refer to the [Developer Tools tutorial](https://pytorch.org/executorch/main/tutorials/devtools-integration-tutorial) for a step-by-step walkthrough of the above process on a sample model. +Please refer to the [Developer Tools tutorial](tutorials/devtools-integration-tutorial) for a step-by-step walkthrough of the above process on a sample model. 
diff --git a/docs/source/runtime.md b/docs/source/runtime.md new file mode 100644 index 00000000000..1d96cc53188 --- /dev/null +++ b/docs/source/runtime.md @@ -0,0 +1,15 @@ +# Runtime + +```{toctree} +:maxdepth: 1 + +runtime-overview +extension-module +extension-tensor +running-a-model-cpp-tutorial +runtime-backend-delegate-implementation-and-linking +runtime-platform-abstraction-layer +portable-cpp-programming +pte-file-format +ptd-file-format +``` diff --git a/docs/source/success-stories.md b/docs/source/success-stories.md new file mode 100644 index 00000000000..cba874132c6 --- /dev/null +++ b/docs/source/success-stories.md @@ -0,0 +1,56 @@ +(success-stories)= + +# Success Stories + +Discover how organizations are leveraging ExecuTorch to deploy AI models at scale on edge devices. + +--- + +## 🎯 Featured Success Stories + +::::{grid} 1 +:gutter: 3 + +:::{grid-item-card} **🚀 Story 1: [Title Placeholder]** +:class-header: bg-primary text-white + +**Industry:** [Industry] +**Hardware:** [Hardware Platform] +**Impact:** [Key Metrics] + +[Placeholder Description] - Brief overview of the challenge, solution, and results achieved. + + +[Read Full Story →](#story-1-details) +::: + +:::{grid-item-card} **⚡ Story 2: [Title Placeholder]** +:class-header: bg-success text-white + +**Industry:** [Industry] +**Hardware:** [Hardware Platform] +**Impact:** [Key Metrics] + +[Placeholder Description] - Brief overview of the challenge, solution, and results achieved. + + + +[Read Full Story →](#story-2-details) +::: + +:::{grid-item-card} **🧠 Story 3: [Title Placeholder]** +:class-header: bg-info text-white + +**Industry:** [Industry] +**Hardware:** [Hardware Platform] +**Impact:** [Key Metrics] + +[Placeholder Description] - Brief overview of the challenge, solution, and results achieved. 
+ + +[Read Full Story →](#story-3-details) +::: + +:::: + +--- diff --git a/docs/source/support-section.md b/docs/source/support-section.md new file mode 100644 index 00000000000..64c47a3e55b --- /dev/null +++ b/docs/source/support-section.md @@ -0,0 +1,17 @@ +(support-section)= +# Support + +In this section, find answers to common questions, troubleshooting guides, and information on how to contribute to the ExecuTorch project. Get help with issues and learn how to participate in the community. + +- {doc}`using-executorch-faqs` — FAQ +- {doc}`using-executorch-troubleshooting` — Common Issues +- {doc}`contributing` — Contributing + +```{toctree} +:hidden: +:maxdepth: 1 +:caption: Support + +using-executorch-faqs +using-executorch-troubleshooting +contributing diff --git a/docs/source/tools-section.md b/docs/source/tools-section.md new file mode 100644 index 00000000000..461a1f6849a --- /dev/null +++ b/docs/source/tools-section.md @@ -0,0 +1,30 @@ +(tools-sdk-section)= + +# Tools + +In this section, explore ExecuTorch's comprehensive developer tools for profiling, debugging, and model inspection. These tools help optimize performance and troubleshoot issues during development and deployment. 
+ +- {doc}`devtools-overview` — Developer Tools Overview +- {doc}`bundled-io` — Bundled I/O +- {doc}`etrecord` — ETRecord +- {doc}`etdump` — ETDump +- {doc}`runtime-profiling` — Profiling Suite +- {doc}`model-debugging` — Debugging Tools +- {doc}`model-inspector` — Model Inspector +- {doc}`memory-planning-inspection` — Memory Planning Inspection +- {doc}`devtools-tutorial` — Development Utilities + +```{toctree} +:hidden: +:maxdepth: 1 +:caption: Tools + +devtools-overview +bundled-io +etrecord +etdump +runtime-profiling +model-debugging +model-inspector +memory-planning-inspection +devtools-tutorial diff --git a/docs/source/tutorial-arm-ethos-u.md b/docs/source/tutorial-arm-ethos-u.md new file mode 100644 index 00000000000..0c713e996f8 --- /dev/null +++ b/docs/source/tutorial-arm-ethos-u.md @@ -0,0 +1,214 @@ +# Arm Ethos-U NPU Backend Tutorial + + +::::{grid} 2 + +:::{grid-item-card} Tutorials we recommend you complete before this: +:class-card: card-prerequisites +* [Introduction to ExecuTorch](intro-how-it-works.md) +* [Getting Started](getting-started.md) +* [Building ExecuTorch with CMake](using-executorch-building-from-source.md) +::: + +:::{grid-item-card} What you will learn in this tutorial: +:class-card: card-prerequisites +In this tutorial you will learn how to export a simple PyTorch model for the ExecuTorch Ethos-U backend. +::: + +:::: + +```{tip} +If you are already familiar with this delegate, you may want to jump directly to the examples: +* [Examples in the ExecuTorch repository](https://github.com/pytorch/executorch/tree/main/examples/arm) +* [A commandline compiler for example models](https://github.com/pytorch/executorch/blob/main/examples/arm/aot_arm_compiler.py) +``` + +This tutorial serves as an introduction to using ExecuTorch to deploy PyTorch models on Arm® Ethos™-U targets. It is based on `ethos_u_minimal_example.ipynb`, provided in Arm’s examples folder. 
+ +## Prerequisites + +### Hardware + +To successfully complete this tutorial, you will need a Linux machine with aarch64 or x86_64 processor architecture, or a macOS™ machine with Apple® Silicon. + +To enable development without a specific development board, we will be using a [Fixed Virtual Platform (FVP)](https://www.arm.com/products/development-tools/simulation/fixed-virtual-platforms), simulating [Arm® Corstone™-300](https://developer.arm.com/Processors/Corstone-300)(cs300) and [Arm® Corstone™-320](https://developer.arm.com/Processors/Corstone-320)(cs320) systems. Think of it as virtual hardware. + +### Software + +First, you will need to install ExecuTorch. Please follow the recommended tutorials to set up a working ExecuTorch development environment. + +In addition to this, you need to install a number of SDK dependencies for generating Ethos-U command streams. Scripts to automate this are available in the main [ExecuTorch repository](https://github.com/pytorch/executorch/tree/main/examples/arm/). +To install Ethos-U dependencies, run +```bash +./examples/arm/setup.sh --i-agree-to-the-contained-eula +``` +This will install: +- [TOSA Serialization Library](https://www.mlplatform.org/tosa/software.html) for serializing the Exir IR graph into TOSA IR. +- [Ethos-U Vela graph compiler](https://pypi.org/project/ethos-u-vela/) for compiling TOSA flatbuffers into an Ethos-U command stream. +- [Arm GNU Toolchain](https://developer.arm.com/Tools%20and%20Software/GNU%20Toolchain) for cross compilation. +- [Corstone SSE-300 FVP](https://developer.arm.com/documentation/100966/1128/Arm--Corstone-SSE-300-FVP) for testing on Ethos-U55 reference design. +- [Corstone SSE-320 FVP](https://developer.arm.com/documentation/109760/0000/SSE-320-FVP) for testing on Ethos-U85 reference design. + +## Set Up the Developer Environment + +The setup.sh script generates a setup_path.sh script that you need to source whenever you restart your shell.
Run: + +```bash +source examples/arm/ethos-u-scratch/setup_path.sh +``` + +As a simple check that your environment is set up correctly, run `which FVP_Corstone_SSE-320` and make sure that the executable is located where you expect, in the `examples/arm` tree. + +## Build + +### Ahead-of-Time (AOT) components + +The ExecuTorch Ahead-of-Time (AOT) pipeline takes a PyTorch Model (a `torch.nn.Module`) and produces a `.pte` binary file, which is then consumed by the ExecuTorch Runtime. This [document](getting-started-architecture.md) goes in much more depth about the ExecuTorch software stack for both AoT as well as Runtime. + +The example below shows how to quantize a model consisting of a single addition, and export it through the AOT flow using the EthosU backend. For more details, see `examples/arm/ethos_u_minimal_example.ipynb`. + +```python +import torch + +class Add(torch.nn.Module): + def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + return x + y + +example_inputs = (torch.ones(1,1,1,1),torch.ones(1,1,1,1)) + +model = Add() +model = model.eval() +exported_program = torch.export.export(model, example_inputs) +graph_module = exported_program.graph_module + + +from executorch.backends.arm.ethosu import EthosUCompileSpec +from executorch.backends.arm.quantizer import ( + EthosUQuantizer, + get_symmetric_quantization_config, +) +from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e + +# Create a compilation spec describing the target for configuring the quantizer +# Some args are used by the Arm Vela graph compiler later in the example.
Refer to Arm Vela documentation for an +# explanation of its flags: https://gitlab.arm.com/artificial-intelligence/ethos-u/ethos-u-vela/-/blob/main/OPTIONS.md +compile_spec = EthosUCompileSpec( + target="ethos-u55-128", + system_config="Ethos_U55_High_End_Embedded", + memory_mode="Shared_Sram", + extra_flags=["--output-format=raw", "--debug-force-regor"] + ) + +# Create and configure quantizer to use a symmetric quantization config globally on all nodes +quantizer = EthosUQuantizer(compile_spec) +operator_config = get_symmetric_quantization_config() +quantizer.set_global(operator_config) + +# Post training quantization +quantized_graph_module = prepare_pt2e(graph_module, quantizer) +quantized_graph_module(*example_inputs) # Calibrate the graph module with the example input +quantized_graph_module = convert_pt2e(quantized_graph_module) + + +# Create a new exported program using the quantized_graph_module +quantized_exported_program = torch.export.export(quantized_graph_module, example_inputs) +from executorch.backends.arm.ethosu import EthosUPartitioner +from executorch.exir import ( + EdgeCompileConfig, + ExecutorchBackendConfig, + to_edge_transform_and_lower, +) +from executorch.extension.export_util.utils import save_pte_program + +# Create partitioner from compile spec +partitioner = EthosUPartitioner(compile_spec) + +# Lower the exported program to the Ethos-U backend +edge_program_manager = to_edge_transform_and_lower( + quantized_exported_program, + partitioner=[partitioner], + compile_config=EdgeCompileConfig( + _check_ir_validity=False, + ), + ) + +# Convert edge program to executorch +executorch_program_manager = edge_program_manager.to_executorch( + config=ExecutorchBackendConfig(extract_delegate_segments=False) + ) + + +# Save pte file +save_pte_program(executorch_program_manager, "ethos_u_minimal_example.pte") +``` + + +```{tip} +For a quick start, you can use the script `examples/arm/aot_arm_compiler.py` to produce the pte. 
+To produce a pte file equivalent to the one above, run +`python -m examples.arm.aot_arm_compiler --model_name=add --delegate --quantize --output=ethos_u_minimal_example.pte` +``` + +### Runtime: + +After the AOT compilation flow is done, the runtime can be cross compiled and linked to the produced `.pte`-file using the Arm cross-compilation toolchain. This is done in two steps: + +First, build and install the ExecuTorch libraries and EthosUDelegate: +``` +# In ExecuTorch top-level, with sourced setup_path.sh +cmake -DCMAKE_BUILD_TYPE=Release --preset arm-baremetal -B cmake-out-arm . +cmake --build cmake-out-arm --target install -j$(nproc) +``` +Second, build and link the `arm_executor_runner` and generate kernel bindings for any non delegated ops. This is the actual program that will run on target. + +``` +# In ExecuTorch top-level, with sourced setup_path.sh +cmake -DCMAKE_TOOLCHAIN_FILE=`pwd`/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DET_PTE_FILE_PATH=ethos_u_minimal_example.pte \ + -DTARGET_CPU=cortex-m55 \ + -DETHOSU_TARGET_NPU_CONFIG=ethos-u55-128 \ + -DMEMORY_MODE=Shared_Sram \ + -DSYSTEM_CONFIG=Ethos_U55_High_End_Embedded \ + -Bethos_u_minimal_example \ + examples/arm/executor_runner +cmake --build ethos_u_minimal_example -j$(nproc) -- arm_executor_runner +``` + +```{tip} +For a quick start, you can use the script `backends/arm/scripts/build_executor_runner.sh` to build the runner. +To build a runner equivalent to the one above, run +`./backends/arm/scripts/build_executor_runner.sh --pte=ethos_u_minimal_example.pte` +``` + +The block diagram below shows, at the high level, how the various build artifacts are generated and are linked together to generate the final bare-metal executable. + +![](arm-delegate-runtime-build.svg) + + + +## Running on Corstone FVP Platforms + +Finally, use the `backends/arm/scripts/run_fvp.sh` utility script to run the .elf-file on simulated Arm hardware.
+``` +backends/arm/scripts/run_fvp.sh --elf=$(find ethos_u_minimal_example -name arm_executor_runner) --target=ethos-u55-128 +``` +The example application is by default built with an input of ones, so the expected result of the quantized addition should be close to 2. + + +## Takeaways + +In this tutorial you have learned how to use ExecuTorch to export a PyTorch model to an executable that can run on an embedded target, and then run that executable on simulated hardware. +To learn more, check out these learning paths: + +https://learn.arm.com/learning-paths/embedded-and-microcontrollers/rpi-llama3/ +https://learn.arm.com/learning-paths/embedded-and-microcontrollers/visualizing-ethos-u-performance/ + +## FAQs + +If you encountered any bugs or issues following this tutorial please file a bug/issue here on [Github](https://github.com/pytorch/executorch/issues/new). + + +``` +Arm is a registered trademark of Arm Limited (or its subsidiaries or affiliates). +``` diff --git a/docs/source/tutorial-arm-vgf.md b/docs/source/tutorial-arm-vgf.md new file mode 100644 index 00000000000..0e34e4be4b6 --- /dev/null +++ b/docs/source/tutorial-arm-vgf.md @@ -0,0 +1,221 @@ +# Arm VGF Backend Tutorial + + +::::{grid} 2 + +:::{grid-item-card} Tutorials we recommend you complete before this: +:class-card: card-prerequisites +* [Introduction to ExecuTorch](intro-how-it-works.md) +* [Getting Started](getting-started.md) +* [Building ExecuTorch with CMake](using-executorch-building-from-source.md) +::: + +:::{grid-item-card} What you will learn in this tutorial: +:class-card: card-prerequisites +In this tutorial you will learn how to export a simple PyTorch model for the ExecuTorch VGF backend. +::: + +:::: + +```{warning} +This delegate is under active development, to get best results please use a recent version. +The VGF backend support is in early development and you may encounter issues. 
+You may encounter some rough edges and features which may be documented or planned but not implemented, please refer to the in-tree documentation for the latest status of features. +``` + +```{tip} +If you are already familiar with this delegate, you may want to jump directly to the examples: +* [Examples in the ExecuTorch repository](https://github.com/pytorch/executorch/tree/main/examples/arm) +* [A commandline compiler for example models](https://github.com/pytorch/executorch/blob/main/examples/arm/aot_arm_compiler.py) +``` + +This tutorial serves as an introduction to using ExecuTorch to deploy PyTorch models on VGF targets. The tutorial is based on `vgf_minimal_example.ipynb`, provided in Arm®'s example folder. + +## Prerequisites + +### Hardware + +To successfully complete this tutorial, you will need a Linux machine with aarch64 or x86_64 processor architecture, or a macOS™ machine with Apple® Silicon. + +To enable development without a specific development board, we will be using the [ML SDK for Vulkan®](https://github.com/arm/ai-ml-sdk-for-vulkan/) to emulate the program consumer. + +### Software + +First, you will need to install ExecuTorch. Please follow the recommended tutorials if you haven't already, to set up a working ExecuTorch development environment. For the VGF backend it's recommended you [install from source](https://docs.pytorch.org/executorch/stable/using-executorch-building-from-source.html), or from a [nightly](https://download.pytorch.org/whl/nightly/executorch/). + +Additionally, you need to install a number of SDK dependencies for generating VGF files. For glslc, prefer installing it via your package manager. If this is not possible, and for other dependencies, there are scripts to automate installation available in the main [ExecuTorch repository](https://github.com/pytorch/executorch/tree/main/examples/arm/). glslc will then be installed via the Vulkan SDK.
+ +To install VGF dependencies, run +```bash +./examples/arm/setup.sh --i-agree-to-the-contained-eula --disable-ethos-u-deps --enable-mlsdk-deps +``` +This will install: +- [TOSA Serialization Library](https://www.mlplatform.org/tosa/software.html) for serializing the Exir IR graph into TOSA IR. +- [ML SDK Model Converter](https://github.com/arm/ai-ml-sdk-model-converter) for converting TOSA flatbuffers to VGF files. +- [Vulkan API (If needed)](https://www.vulkan.org) Should be set up locally for GPU execution support. +- [ML Emulation Layer for Vulkan](https://github.com/arm/ai-ml-emulation-layer-for-vulkan) for testing on Vulkan API. + + +## Set Up the Developer Environment + +The `setup.sh` script has generated a `setup_path.sh` script that you need to source whenever you restart your shell. Do this by running + +`source examples/arm/ethos-u-scratch/setup_path.sh` + +As a simple check that your environment is set up correctly, run + +```bash +which model-converter +``` +Make sure the executable is located where you expect, in the `examples/arm` tree. + +## Build + +### Ahead-of-Time (AOT) components + +The ExecuTorch Ahead-of-Time (AOT) pipeline takes a PyTorch Model (a `torch.nn.Module`) and produces a `.pte` binary file, which is then typically consumed by the ExecuTorch Runtime. This [document](getting-started-architecture.md) goes in much more depth about the ExecuTorch software stack for both AoT as well as Runtime. + +The example below shows how to quantize a model consisting of a single addition, and export it through the AOT flow using the VGF backend. For more details, see `examples/arm/vgf_minimal_example.ipynb`.
+ +```python +import torch + +class Add(torch.nn.Module): + def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + return x + y + +example_inputs = (torch.ones(1,1,1,1),torch.ones(1,1,1,1)) + +model = Add() +model = model.eval() +exported_program = torch.export.export_for_training(model, example_inputs) +graph_module = exported_program.graph_module + + +from executorch.backends.arm.vgf import VgfCompileSpec +from executorch.backends.arm.quantizer import ( + VgfQuantizer, + get_symmetric_quantization_config, +) +from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e + +# Create a compilation spec describing the target for configuring the quantizer +compile_spec = VgfCompileSpec("TOSA-1.0+INT") + +# Create and configure quantizer to use a symmetric quantization config globally on all nodes +quantizer = VgfQuantizer(compile_spec) +operator_config = get_symmetric_quantization_config(is_per_channel=False) +quantizer.set_global(operator_config) + +# Post training quantization +quantized_graph_module = prepare_pt2e(graph_module, quantizer) +quantized_graph_module(*example_inputs) # Calibrate the graph module with the example input +quantized_graph_module = convert_pt2e(quantized_graph_module) + + +# Create a new exported program using the quantized_graph_module +quantized_exported_program = torch.export.export(quantized_graph_module, example_inputs) +import os +from executorch.backends.arm.vgf import VgfPartitioner +from executorch.exir import ( + EdgeCompileConfig, + ExecutorchBackendConfig, + to_edge_transform_and_lower, +) +from executorch.extension.export_util.utils import save_pte_program + +# Create partitioner from compile spec +partitioner = VgfPartitioner(compile_spec) + +# Lower the exported program to the VGF backend +edge_program_manager = to_edge_transform_and_lower( + quantized_exported_program, + partitioner=[partitioner], + compile_config=EdgeCompileConfig( + _check_ir_validity=False, + ), +) + +# Convert edge 
program to executorch +executorch_program_manager = edge_program_manager.to_executorch( + config=ExecutorchBackendConfig(extract_delegate_segments=False) +) + + +# Save pte file +cwd_dir = os.getcwd() +pte_base_name = "simple_example" +pte_name = pte_base_name + ".pte" +pte_path = os.path.join(cwd_dir, pte_name) +save_pte_program(executorch_program_manager, pte_name) +assert os.path.exists(pte_path), "Build failed; no .pte-file found" +``` + + +```{tip} +For a quick start, you can use the script `examples/arm/aot_arm_compiler.py` to produce the pte. +To produce a pte file equivalent to the one above, run +`python -m examples.arm.aot_arm_compiler --model_name=add --delegate --quantize --output=simple_example.pte --target=vgf` +``` + +### Runtime: + +## Build executor runtime + +After the AOT compilation flow is done, we can build the executor runner target. For this tutorial, the default runner can be used. Build it with the following configuration: + +```bash +# In ExecuTorch top-level, with sourced setup_path.sh +cmake \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Debug \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ + -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=OFF \ + -DEXECUTORCH_BUILD_VULKAN=ON \ + -DEXECUTORCH_BUILD_VGF=ON \ + -DEXECUTORCH_ENABLE_LOGGING=ON \ + -DPYTHON_EXECUTABLE=python \ + -Bcmake-out . + +cmake --build cmake-out --target executor_runner +``` + + +The block diagram below demonstrates, at the high level, how the various build artifacts are generated and are linked together to generate the final bare-metal executable.
+ +![](arm-delegate-runtime-build.svg) + + +## Deploying and running on device + +Since we are using the Vulkan emulation layer, we can run the executor runner with the VGF delegate on the host machine: + +```bash +./cmake-out/executor_runner -model_path simple_example.pte +``` + +The example application is by default built with an input of ones, so the expected result of the quantized addition should be close to 2. + +## Takeaways + +In this tutorial you have learned how to use ExecuTorch to export a PyTorch model to an executable that can run on an embedded target, and then run that executable on simulated hardware. + + +## FAQs + +*glslc is not found when configuring the executor runner*. + +The Vulkan sdk is likely not in your path, check whether setup_path.sh contains something like +`export PATH=$(pwd)/examples/arm/ethos-u-scratch/vulkan_sdk/1.4.321.1/x86_64/bin:$PATH`. +If not, add it and source the file. + +If you encountered any bugs or issues following this tutorial please file a bug/issue here on [Github](https://github.com/pytorch/executorch/issues/new). + +``` +Arm is a registered trademark of Arm Limited (or its subsidiaries or affiliates). +``` \ No newline at end of file diff --git a/docs/source/tutorial-arm.md b/docs/source/tutorial-arm.md deleted file mode 100644 index 0692b631154..00000000000 --- a/docs/source/tutorial-arm.md +++ /dev/null @@ -1,467 +0,0 @@ -# Arm® Backend Tutorial - - -::::{grid} 2 - -:::{grid-item-card} Tutorials we recommend you complete before this: -:class-card: card-prerequisites -* [Introduction to ExecuTorch](intro-how-it-works.md) -* [Getting Started](getting-started.md) -* [Building ExecuTorch with CMake](using-executorch-building-from-source.md) -::: - -:::{grid-item-card} What you will learn in this tutorial: -:class-card: card-prerequisites -In this tutorial you will learn how to export a simple PyTorch model for ExecuTorch Arm backends. 
-::: - -:::: - -```{warning} -This delegate is under active development, to get best results please use a recent version. -The TOSA and Ethos(tm) backend support is reasonably mature and used in production by some users. -The VGF backend support is in early development and you may encounter issues. -You may encounter some rough edges and features which may be documented or planned but not implemented, please refer to the in-tree documentation for the latest status of features. -``` - -```{tip} -If you are already familiar with this delegate, you may want to jump directly to the examples: -* [Examples in the ExecuTorch repository](https://github.com/pytorch/executorch/tree/main/examples/arm) -* [Compilation for Ethos-U](https://github.com/pytorch/executorch/blob/main/examples/arm/ethos_u_minimal_example.ipynb) -* [A commandline compiler for example models](https://github.com/pytorch/executorch/blob/main/examples/arm/aot_arm_compiler.py) -``` - -## Prerequisites - -Let's make sure you have everything you need before you get started. - -### Hardware - -To successfully complete this tutorial, you will need a Linux or MacOS host machine with Arm aarch64 or x86_64 processor architecture. - -The target device will be an emulated platform to enable development without a specific development board. This tutorial has guidance for both Ethos-U targets and VGF via the ML SDK for Vulkan®. - -For Ethos-U and Cortex-M, We will be using a [Fixed Virtual Platform (FVP)](https://www.arm.com/products/development-tools/simulation/fixed-virtual-platforms), simulating [Corstone-300](https://developer.arm.com/Processors/Corstone-300)(cs300) and [Corstone-320](https://developer.arm.com/Processors/Corstone-320)(cs320)systems. Since we will be using the FVP (think of it as virtual hardware), we won't be requiring any real embedded hardware for this tutorial. - -For VGF we will be using the [ML SDK for Vulkan(R)](https://github.com/arm/ai-ml-sdk-for-vulkan/)) to emulate the program consumer. 
- -### Software - -First, you will need to install ExecuTorch. Please follow the recommended tutorials if you haven't already, to set up a working ExecuTorch development environment. For the VGF backend it's recommended you [install from source](https://docs.pytorch.org/executorch/stable/using-executorch-building-from-source.html), or from a [nightly](https://download.pytorch.org/whl/nightly/executorch/). - -In addition to this, you need to install a number of SDK dependencies for generating Ethos-U command streams or VGF files. There are scripts which automate this, which are found in the main [ExecuTorch repository](https://github.com/pytorch/executorch/tree/main/examples/arm/). - -## Set Up the Developer Environment - -In this section, we will do a one-time setup of the platform support files needed to run ExecuTorch programs in this tutorial. It is recommended to run the script in a conda or venv environment. - -With a checkout of the ExecuTorch repository, we will use the `examples/arm/setup.sh` script to pull each item in an automated fashion. - -For Ethos-U run: -```bash -./examples/arm/setup.sh --i-agree-to-the-contained-eula -``` - -For VGF run: -```bash -./examples/arm/setup.sh --i-agree-to-the-contained-eula --disable-ethos-u-deps --enable-mlsdk-deps -``` -It is possible to install both sets of dependencies if you omit the disable options. - - -### Notes: - -```{warning} -The `setup.sh` script has generated a `setup_path.sh` script that you need to source whenever you restart your shell. -``` - -i.e. run -`source executorch/examples/arm/ethos-u-scratch/setup_path.sh` - - -To confirm your environment is set up correctly and will enable you to generate .pte's for your target: - -For Ethos-U run: -```bash -# Check for Vela, which converts TOSA to Ethos-U command streams. -which vela -``` - -For VGF run: -```bash -# Check for model-converter, which converts TOSA to ML-SDK VGF format. 
-which model-converter -``` - -To ensure there's no environment pollution you should confirm these binaries reside within your executorch checkout, under the examples/arm tree. Other versions may present compatibility issues, so this should be corrected by modifying your environment variables such as ${PATH} appropriately. - - -## Convert the PyTorch Model to the `.pte` File - -`.pte` is a binary file produced by ExecuTorch Ahead-of-Time (AoT) pipeline by taking in a PyTorch Model (a torch.nn.Module), exporting it, running a variety of passes, and finally serializing it to a `.pte` file format. This binary file is typically consumed by the ExecuTorch Runtime. This [document](https://github.com/pytorch/executorch/blob/main/docs/source/getting-started-architecture.md) goes in much more depth about the ExecuTorch software stack for both AoT as well as Runtime. - -In this section, we will primarily focus on the AoT flow with the end goal of producing a `.pte` file. There are a set of export configurations to target different backends at runtime. For each, the AoT flow will produce a unique `.pte` file. We will explore a couple of different configurations producing different `.pte` files, particularly interesting for our Corstone-300 system and available processing elements. - -Before we get started, let's first talk about the PyTorch modules we will be using. - -### PyTorch Example Modules -We will use a couple of simple PyTorch Modules to explore the end-to-end flow. These modules will be used in various different ways throughout the tutorial, referring to them by their ``. - -#### SoftmaxModule -This is a very simple PyTorch module with just one [Softmax](https://pytorch.org/docs/stable/generated/torch.nn.Softmax.html#torch.nn.Softmax) operator. 
- -```python -import torch - -class SoftmaxModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.softmax = torch.nn.Softmax() - - def forward(self, x): - z = self.softmax(x) - return z -``` - -Running it using the Python environment (on the same development Linux machine), you get the expected output. - -```python ->>> m = SoftmaxModule() ->>> m(torch.ones(2,2)) -tensor([[0.5000, 0.5000], - [0.5000, 0.5000]]) -``` - -#### AddModule -Let's write another simple PyTorch module with just one [Add](https://pytorch.org/docs/stable/generated/torch.add.html#torch.add) operator. - -```python -class AddModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return x + x -``` - -Running it in python shows that 1 + 1 produces 2 as exepected: - -```python ->>> m = AddModule() ->>> m(torch.ones(5, dtype=torch.int32)) # integer types for non-quantized Ethos-U delegation -tensor([2, 2, 2, 2, 2], dtype=torch.int32) -``` -Keep the inputs and outputs to these modules in mind. When you will lower and run this through alternate means as opposed to running on this Linux machine, you will use the same inputs, and expect the outputs to match with the one shown here. - -```{tip} -you need to be aware of data types for running networks on the Ethos-U as it is an integer only co-processor. For this example you use integer types explicitly, for typical use of such a flow networks are built and trained in floating point, and then are quantized from floating point to integer for efficient inference. -``` - -#### MobileNetV2 Module -[MobileNetV2](https://arxiv.org/abs/1801.04381) is a commonly used network for edge and mobile devices. -It's also available as a default model in [torchvision](https://github.com/pytorch/vision), so you can load it with the sample code below. 
-``` -from torchvision.models import mobilenet_v2 # @manual -from torchvision.models.mobilenetv2 import MobileNet_V2_Weights - -mv2 = mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT) -``` -For more details, refer to the code snippet [here](https://github.com/pytorch/executorch/blob/2354945d47f67f60d9a118ea1a08eef8ba2364b5/examples/models/mobilenet_v2/model.py#L18). - -### Non-delegated Workflow - -In the ExecuTorch AoT pipeline, one of the options is to select a backend. ExecuTorch offers a variety of different backends. Selecting backend is optional, it is typically done to target a particular mode of acceleration or hardware for a given model compute requirements. Without any backends, ExecuTorch runtime will fallback to using, available by default, a highly portable set of operators. - -It's expected that on platforms with dedicated acceleration like the Ethos-U55, that the non-delegated flow is used for two primary cases: -1. When the network is designed to be very small and best suited to run on the Cortex-M alone. -2. When the network has a mix of operations that can target the NPU and those that can't, e.g. the Ethos-U55 supports integer operations and so floating point softmax will fall back to execute on the CPU. - -In this flow, without any backend delegates, to illustrate the portability of the ExecuTorch runtime, as well as of the operator library you will skip specifying the backend during the `.pte` generation. - -Following script will serve as a helper utility to help generating the `.pte` file. This is available in the `examples/arm` directory. - -```bash -python3 -m examples.arm.aot_arm_compiler --model_name="softmax" -# This should produce ./softmax_arm_ethos-u55-128.pte -``` - -### Delegated Workflow - -Working with Arm, you introduced a new Arm backend delegate for ExecuTorch. This backend is under active development and has a limited set of features available as of writing this. 
- -By including a following step during the ExecuTorch AoT export pipeline to generate the `.pte` file, you can enable this backend delegate. - -```python -from executorch.backends.arm.arm_backend import generate_ethosu_compile_spec - -graph_module_edge.exported_program = to_backend( - model.exported_program, - ArmPartitioner(generate_ethosu_compile_spec("ethos-u55-128"))) -``` - -Similar to the non-delegate flow, the same script will server as a helper utility to help generate the `.pte` file. Notice the `--delegate` option to enable the `to_backend` call. - -For Ethos targets: -```bash -python3 -m examples.arm.aot_arm_compiler --model_name="add" --delegate -# This targets the default of ethos-u55-128, see --help for further targets -# should produce ./add_arm_delegate_ethos-u55-128.pte -``` - -For basic post-training quantization: -```bash -python3 -m examples.arm.aot_arm_compiler --model_name="mv2" --delegate --quantize -# This targets the default of ethos-u55-128, see --help for further targets -# should produce ./mv2_arm_delegate_ethos-u55-128.pte -``` - - -For VGF targets: -```bash -python3 -m examples.arm.aot_arm_compiler --model_name="add" --target=vgf --delegate -# should produce ./add_arm_delegate_vgf.pte -``` - -For basic post-training quantization: -```bash -python3 -m examples.arm.aot_arm_compiler --model_name="mv2" --target=vgf --delegate --quantize -# should produce ./mv2_arm_delegate_vgf.pte -``` - -To capture intermediates such as VGF for lower level integration, invoke with the "-i" option: -```bash -python3 -m examples.arm.aot_arm_compiler --model_name="mv2" --target=vgf --delegate --quantize -i ./mv2_output -# should produce ./mv2_arm_delegate_vgf.pte and intermediates in ./mv2_out/ -``` - -
- -At the end of this, you should have a number of different `.pte` files. - -- the SoftmaxModule, without any backend delegates. -- the AddModule, targeting the Arm Ethos-U backend. -- the Quantized MV2Model, targeting the Arm Ethos-U backend. -- the AddModule, targeting the VGF backend. -- the Quantized MV2Model, targeting the VGF backend. - -Now let's try to run these `.pte` files on a target. - -## Getting a Bare-Metal Executable - -In this section, you will go over steps that you need to go through to build the runtime application. This then run on the target device. In the executorch repository you have a functioning script which does the exact same steps. It is located at `executorch/examples/arm/run.sh`. You will use that to build necessary pieces and finally run the previously generated PTE file on an FVP. - -By default the `run.sh` will use `arm_test/` as an build and output folder and you will find the build artifacts under it. This can be controlled/overrided with the `--et_build_root` and the `--output` flags if needed. - -e.g. running `examples/arm/run.sh --model_name=add --target=ethos-u85-128` will produce a pte and elf file like this: - -```bash -arm_test/add/add_arm_delegate_ethos-u85-128.pte -arm_test/add/cmake-out/arm_executor_runner -``` -Also before you get started, make sure that you have completed ExecuTorch cmake build setup, and the instructions to setup the development environment described [earlier](#set-up-the-developer-environment). - -The block diagram below demonstrates, at the high level, how the various build artifacts are generated and are linked together to generate the final bare-metal executable. 
- -![](arm-delegate-runtime-build.svg) - -```{tip} -The `generate_pte_file` function in `run.sh` script produces the `.pte` files based on the models provided through `--model_name` input argument -``` - -### Generating ExecuTorch Libraries - -ExecuTorch's CMake build system produces a set of build pieces which are critical to building the ExecuTorch runtime with-in the bare-metal environment you have for Corstone FVPs from Ethos-U SDK. - -[This](using-executorch-building-from-source.md) document provides a detailed overview of each individual build piece. For running either variant of the `.pte` file, you will need a core set of libraries. Here is a list, - -- `libexecutorch.a` -- `libportable_kernels.a` -- `libportable_ops_lib.a` - -To run a `.pte` file with the Arm backend delegate call instructions, you will need the Arm backend delegate runtime library, that is, - -- `libexecutorch_delegate_ethos_u.a` - -These libraries are generated by the `backends/arm/scripts/build_executorch.sh` script called from the `run.sh` script. - -### Building the executor_runner Bare-Metal Application - -The SDK dir is the same one prepared [earlier](#setup-the-arm-ethos-u-software-development). And, you will be passing the `.pte` file (any one of them) generated above. - -Note, you have to generate a new `executor-runner` binary if you want to change the model or the `.pte` file. This constraint is from the constrained bare-metal runtime environment you have for Corstone-300/Corstone-320 platforms. The build also generates a kernel registration library for the relevant operators which could not be delegated to the EthosU, see the [Kernel Library Selective Build documentation](https://docs.pytorch.org/executorch/stable/kernel-library-selective-build.html). - -This step is executed by the build_executor_runner.sh script, which is invoked from the run.sh in the backends/arm/scripts folder. 
- -```{tip} -The `run.sh` script takes in `--target` option, which provides a way to provide a specific target, Corstone-300(ethos-u55-128) or Corstone-320(ethos-u85-128) -``` - -## Running on Corstone FVP Platforms - -Once the elf is prepared, regardless of the `.pte` file variant is used to generate the bare metal elf. `run.sh` will run the FVP for you via the `backends/arm/scripts/run_fvp.sh` script. - -#### Automatic FVP Selection - -- To run a specific test model with the compiler flag and target -```bash -./run.sh --model_name=mv2 --delegate --quantize --target=ethos-u85-128 -``` - -- To run a specific test model and target -```bash -./run.sh --model_name=mv2 --delegate --target=ethos-u85-128 -``` - -- To run all the test models iteratively in a loop , simply run -```bash -./run.sh -``` - -Note that you could use `build_executor_runner.sh` and `run_fvp.sh` scripts in tandem by passing the relevant --target argument (e.g., --target=ethos-u55-128), the correct FVP binary will be chosen automatically. For more details, see the [section on Runtime Integration](https://docs.pytorch.org/executorch/main/backends-arm-ethos-u.html#runtime-integration). 
- - -#### Manual FVP Binary Selection - -- If you build for the Ethos delegate U55/U65 target (e.g., using --target=ethos-u55-128 or --target=ethos-u65-256 with `build_executor_runner.sh` and `run_fvp.sh`), you should use the corresponding FVP binary: - - For U55: - ```bash - examples/arm/ethos-u-scratch/FVP-corstone300/models/Linux64_GCC-9.3/FVP_Corstone_SSE-300_Ethos-U55 - ``` - - For U65: - ```bash - examples/arm/ethos-u-scratch/FVP-corstone300/models/Linux64_GCC-9.3/FVP_Corstone_SSE-300_Ethos-U65 - ``` -- And say if you are not building for an Ethos target, use: - ```bash - examples/arm/ethos-u-scratch/FVP-corstone320/models/Linux64_GCC-9.3/FVP_Corstone_SSE-320 - ``` - -Following is an example usage: - -```bash -ethos_u_build_dir=examples/arm/executor_runner/ - -elf=$(find ${ethos_u_build_dir} -name "arm_executor_runner") - -FVP_Corstone_SSE-320 \ - -C mps4_board.subsystem.ethosu.num_macs=128 \ - -C mps4_board.visualisation.disable-visualisation=1 \ - -C vis_hdlcd.disable_visualisation=1 \ - -C mps4_board.telnetterminal0.start_telnet=0 \ - -C mps4_board.uart0.out_file='-' \ - -C mps4_board.uart0.shutdown_on_eot=1 \ - -a "${elf}" \ - --timelimit 120 || true # seconds- after which sim will kill itself -``` - -#### Verification of Successful FVP Execution -After running the FVP command, either automatically or manually, you should see output similar to the following on your shell if the execution is successful: - -```console -I [executorch:arm_executor_runner.cpp:364] Model in 0x70000000 $ -I [executorch:arm_executor_runner.cpp:366] Model PTE file loaded. Size: 4425968 bytes. -I [executorch:arm_executor_runner.cpp:376] Model buffer loaded, has 1 methods -I [executorch:arm_executor_runner.cpp:384] Running method forward -I [executorch:arm_executor_runner.cpp:395] Setup Method allocator pool. Size: 62914560 bytes. -I [executorch:arm_executor_runner.cpp:412] Setting up planned buffer 0, size 752640. 
-I [executorch:ArmBackendEthosU.cpp:79] ArmBackend::init 0x70000070 -I [executorch:arm_executor_runner.cpp:445] Method loaded. -I [executorch:arm_executor_runner.cpp:447] Preparing inputs... -I [executorch:arm_executor_runner.cpp:461] Input prepared. -I [executorch:arm_executor_runner.cpp:463] Starting the model execution... -I [executorch:ArmBackendEthosU.cpp:118] ArmBackend::execute 0x70000070 -I [executorch:ArmBackendEthosU.cpp:298] Tensor input/output 0 will be permuted -I [executorch:arm_perf_monitor.cpp:120] NPU Inferences : 1 -I [executorch:arm_perf_monitor.cpp:121] Profiler report, CPU cycles per operator: -I [executorch:arm_perf_monitor.cpp:125] ethos-u : cycle_cnt : 1498202 cycles -I [executorch:arm_perf_monitor.cpp:132] Operator(s) total: 1498202 CPU cycles -I [executorch:arm_perf_monitor.cpp:138] Inference runtime: 6925114 CPU cycles total -I [executorch:arm_perf_monitor.cpp:140] NOTE: CPU cycle values and ratio calculations require FPGA and identical CPU/NPU frequency -I [executorch:arm_perf_monitor.cpp:149] Inference CPU ratio: 99.99 % -I [executorch:arm_perf_monitor.cpp:153] Inference NPU ratio: 0.01 % -I [executorch:arm_perf_monitor.cpp:162] cpu_wait_for_npu_cntr : 729 CPU cycles -I [executorch:arm_perf_monitor.cpp:167] Ethos-U PMU report: -I [executorch:arm_perf_monitor.cpp:168] ethosu_pmu_cycle_cntr : 5920305 -I [executorch:arm_perf_monitor.cpp:171] ethosu_pmu_cntr0 : 359921 -I [executorch:arm_perf_monitor.cpp:171] ethosu_pmu_cntr1 : 0 -I [executorch:arm_perf_monitor.cpp:171] ethosu_pmu_cntr2 : 0 -I [executorch:arm_perf_monitor.cpp:171] ethosu_pmu_cntr3 : 503 -I [executorch:arm_perf_monitor.cpp:178] Ethos-U PMU Events:[ETHOSU_PMU_EXT0_RD_DATA_BEAT_RECEIVED, ETHOSU_PMU_EXT1_RD_DATA_BEAT_RECEIVED, ETHOSU_PMU_EXT0_WR_DATA_BEAT_WRITTEN, ETHOSU_PMU_NPU_IDLE] -I [executorch:arm_executor_runner.cpp:470] model_pte_loaded_size: 4425968 bytes. 
-I [executorch:arm_executor_runner.cpp:484] method_allocator_used: 1355722 / 62914560 free: 61558838 ( used: 2 % ) -I [executorch:arm_executor_runner.cpp:491] method_allocator_planned: 752640 bytes -I [executorch:arm_executor_runner.cpp:493] method_allocator_loaded: 966 bytes -I [executorch:arm_executor_runner.cpp:494] method_allocator_input: 602116 bytes -I [executorch:arm_executor_runner.cpp:495] method_allocator_executor: 0 bytes -I [executorch:arm_executor_runner.cpp:498] temp_allocator_used: 0 / 1048576 free: 1048576 ( used: 0 % ) -I [executorch:arm_executor_runner.cpp:152] Model executed successfully. -I [executorch:arm_executor_runner.cpp:156] 1 outputs: -Output[0][0]: -0.749744 -Output[0][1]: -0.019224 -Output[0][2]: 0.134570 -...(Skipped) -Output[0][996]: -0.230691 -Output[0][997]: -0.634399 -Output[0][998]: -0.115345 -Output[0][999]: 1.576386 -I [executorch:arm_executor_runner.cpp:177] Program complete, exiting. -I [executorch:arm_executor_runner.cpp:179] -``` - -```{note} -The `run.sh` script provides various options to select a particular FVP target, use desired models, select portable kernels and can be explored using the `--help` argument -``` - -## Running on the VGF backend with the standard executor_runner for Linux - -Follow typical [Building ExecuTorch with CMake](using-executorch-building-from-source.md) flow to build the linux target, ensuring that the VGF delegate is enabled. - -```bash --DEXECUTORCH_BUILD_VGF=ON -``` - -A full example buld line is: -``` -cmake bash \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_BUILD_XNNPACK=OFF \ - -DEXECUTORCH_BUILD_VULKAN=ON \ - -DEXECUTORCH_BUILD_VGF=ON \ - -DEXECUTORCH_ENABLE_LOGGING=ON \ - -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ - -DPYTHON_EXECUTABLE=python \ - -Bcmake-out . 
-cmake --build cmake-out -j25 --target install --config Release -``` - -You can then invoke the executor runner on the host machine, which will use the VGF delegate, and requires the vulkan layer drivers we installed with setup.sh. - -```bash -./cmake-out/executor_runner -model_path add_arm_delegate_vgf.pte -``` - - -## Takeaways -In this tutorial you have learnt how to use the ExecuTorch software to both export a standard model from PyTorch and to run it on the compact and fully functioned ExecuTorch runtime, enabling a smooth path for offloading models from PyTorch to Arm based platforms. - -To recap, there are two major flows: - * A direct flow which offloads work onto the Cortex-M using libraries built into ExecuTorch. - * A delegated flow which partitions the graph into sections for Cortex-M and sections which can be offloaded and accelerated on the Ethos-U hardware. - -Both of these flows continue to evolve, enabling more use-cases and better performance. - -## FAQs - - -If you encountered any bugs or issues following this tutorial please file a bug/issue here on [Github](https://github.com/pytorch/executorch/issues/new). diff --git a/docs/source/tutorial-template.md b/docs/source/tutorial-template.md index b25731afa17..73b787c9e2c 100644 --- a/docs/source/tutorial-template.md +++ b/docs/source/tutorial-template.md @@ -9,12 +9,12 @@ :::{grid-item-card} Tutorials we recommend you complete before this: :class-card: card-prerequisites * [Introduction to ExecuTorch](intro-how-it-works.md) -* [Setting up ExecuTorch](getting-started-setup.md) -* [Building ExecuTorch with CMake](runtime-build-and-cross-compilation.md) +* [Setting up ExecuTorch](getting-started-setup.rst) +* [Building ExecuTorch with CMake](using-executorch-building-from-source.md) ::: :::: -## Prerequsites (Hardware and Software) +## Prerequisites (Hardware and Software) Provide instructions on what kind of hardware and software are pre-requisite for the tutorial. 
diff --git a/docs/source/tutorial-xnnpack-delegate-lowering.md b/docs/source/tutorial-xnnpack-delegate-lowering.md index bccd4e4add3..3fb079f24d6 100644 --- a/docs/source/tutorial-xnnpack-delegate-lowering.md +++ b/docs/source/tutorial-xnnpack-delegate-lowering.md @@ -11,7 +11,7 @@ In this tutorial, you will learn how to export an XNNPACK lowered Model and run :::{grid-item-card} Before you begin it is recommended you go through the following: :class-card: card-prerequisites * [Setting up ExecuTorch](getting-started-setup.rst) -* [Model Lowering Tutorial](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial) +* [Model Lowering Tutorial](tutorials/export-to-executorch-tutorial) * [ExecuTorch XNNPACK Delegate](backends-xnnpack.md) ::: :::: @@ -74,7 +74,7 @@ After lowering to the XNNPACK Program, we can then prepare it for executorch and ## Lowering a Quantized Model to XNNPACK -The XNNPACK delegate can also execute symmetrically quantized models. To understand the quantization flow and learn how to quantize models, refer to [Custom Quantization](quantization-custom-quantization.md) note. For the sake of this tutorial, we will leverage the `quantize()` python helper function conveniently added to the `executorch/executorch/examples` folder. +The XNNPACK delegate can also execute symmetrically quantized models. To understand the quantization flow and learn how to quantize models, refer to [Quantization Overview](quantization-overview.md). For the sake of this tutorial, we will leverage the `quantize()` python helper function conveniently added to the `executorch/executorch/examples` folder. ```python from torch.export import export diff --git a/docs/source/usage.md b/docs/source/usage.md new file mode 100644 index 00000000000..6ffc136093b --- /dev/null +++ b/docs/source/usage.md @@ -0,0 +1,19 @@ +# Usage + +This section describes how to use ExecuTorch. 
It covers everything from +getting started to platform-specific implementations, runtime integration, +troubleshooting, and frequently asked questions. + +```{toctree} +:maxdepth: 1 + +getting-started +using-executorch-export +using-executorch-android +using-executorch-ios +using-executorch-cpp +using-executorch-runtime-integration +using-executorch-troubleshooting +using-executorch-building-from-source +using-executorch-faqs +``` diff --git a/docs/source/using-executorch-android.md b/docs/source/using-executorch-android.md index 23513302063..ce9977218a1 100644 --- a/docs/source/using-executorch-android.md +++ b/docs/source/using-executorch-android.md @@ -72,7 +72,7 @@ curl -O https://ossci-android.s3.amazonaws.com/executorch/release/snapshot-20250 curl -O https://ossci-android.s3.amazonaws.com/executorch/release/snapshot-20250412/executorch.aar.sha256sums ``` -We aim to make every daily snapshot available and useable. However, for best stability, please use releases, not snapshots. +We aim to make every daily snapshot available and usable. However, for best stability, please use releases, not snapshots. ## Using AAR file @@ -83,12 +83,12 @@ To add the AAR file to your app: An AAR file itself does not contain dependency info, unlike the Maven one which bundled with pom.xml. The Java package requires `fbjni` and `soloader`, and currently requires users to explicitly declare the dependency. 
Therefore, two more `dependencies` in gradle rule is required: ``` implementation("com.facebook.soloader:soloader:0.10.5") -implementation("com.facebook.fbjni:fbjni:0.5.1") +implementation("com.facebook.fbjni:fbjni:0.7.0") ``` ### Example usage -In your app working directory, such as executorch/examples/demo-apps/android/LlamaDemo, +In your app working directory, such as executorch-examples/llm/android/LlamaDemo, ``` mkdir -p app/libs curl https://ossci-android.s3.amazonaws.com/executorch/release/${executorch_version}/executorch.aar -o app/libs/executorch.aar @@ -100,7 +100,7 @@ And include it in gradle: dependencies { implementation(files("libs/executorch.aar")) implementation("com.facebook.soloader:soloader:0.10.5") - implementation("com.facebook.fbjni:fbjni:0.5.1") + implementation("com.facebook.fbjni:fbjni:0.7.0") } ``` @@ -112,7 +112,7 @@ Now you can compile your app with the ExecuTorch Android library. You need Android [SDK](https://developer.android.com/studio) and [NDK](https://developer.android.com/ndk/downloads) to use it. -Current NDK version used in ExecuTorch CI: r27b. +Current NDK version used in ExecuTorch CI: r28c. You need to set `ANDROID_HOME` to Android SDK home and `ANDROID_NDK` to the correct NDK root (containing NOTICE file). @@ -202,7 +202,7 @@ adb push extension/module/test/resources/add.pte /data/local/tmp/ This example loads an ExecuTorch module, prepares input data, runs inference, and processes the output data. Please use [DeepLabV3AndroidDemo](https://github.com/meta-pytorch/executorch-examples/tree/main/dl3/android/DeepLabV3Demo) -and [LlamaDemo](https://github.com/pytorch/executorch/tree/main/examples/demo-apps/android/LlamaDemo) for the code examples +and [LlamaDemo](https://github.com/meta-pytorch/executorch-examples/tree/main/llm/android/LlamaDemo) for the code examples using ExecuTorch AAR package. 
## Java API reference diff --git a/docs/source/using-executorch-cpp.md b/docs/source/using-executorch-cpp.md index 3736226bc06..5505ade9573 100644 --- a/docs/source/using-executorch-cpp.md +++ b/docs/source/using-executorch-cpp.md @@ -69,7 +69,7 @@ The runner source code can be found in the ExecuTorch repo under [examples/porta ## Next Steps -- [Runtime API Reference](executorch-runtime-api-reference.md) for documentation on the available C++ runtime APIs. +- [Runtime API Reference](executorch-runtime-api-reference.rst) for documentation on the available C++ runtime APIs. - [Running an ExecuTorch Model Using the Module Extension in C++](extension-module.md) for information on the high-level Module API. - [Managing Tensor Memory in C++](extension-tensor.md) for information on high-level tensor APIs. - [Running an ExecuTorch Model in C++ Tutorial](running-a-model-cpp-tutorial.md) for information on the low-level runtime APIs. diff --git a/docs/source/using-executorch-export.md b/docs/source/using-executorch-export.md index 2a887bb346d..7abf5cbd30a 100644 --- a/docs/source/using-executorch-export.md +++ b/docs/source/using-executorch-export.md @@ -24,7 +24,7 @@ Quantization - the process of using reduced precision to reduce inference time a ExecuTorch backends provide hardware acceleration for a specific hardware target. In order to achieve maximum performance on target hardware, ExecuTorch optimizes the model for a specific backend during the export and lowering process. This means that the resulting .pte file is specialized for the specific hardware. In order to deploy to multiple backends, such as Core ML on iOS and Arm CPU on Android, it is common to generate a dedicated .pte file for each. -The choice of hardware backend is informed by the hardware that the model is intended to be deployed on. Each backend has specific hardware requires and level of model support. See the documentation for each hardware backend for more details. 
+The choice of hardware backend is informed by the hardware that the model is intended to be deployed on. Each backend has specific hardware requirements and level of model support. See the documentation for each hardware backend for more details. As part of the .pte file creation process, ExecuTorch identifies portions of the model (partitions) that are supported for the given backend. These sections are processed by the backend ahead of time to support efficient execution. Portions of the model that are not supported on the delegate, if any, are executed using the portable fallback implementation on CPU. This allows for partial model acceleration when not all model operators are supported on the backend, but may have negative performance implications. In addition, multiple partitioners can be specified in order of priority. This allows for operators not supported on GPU to run on CPU via XNNPACK, for example. @@ -32,7 +32,7 @@ As part of the .pte file creation process, ExecuTorch identifies portions of the Commonly used hardware backends are listed below. For mobile, consider using XNNPACK for Android and XNNPACK or Core ML for iOS. To create a .pte file for a specific backend, pass the appropriate partitioner class to `to_edge_transform_and_lower`. See the appropriate backend documentation and the [Export and Lowering](#export-and-lowering) section below for more information. 
-- [XNNPACK (Mobile CPU)](backends-xnnpack.md) +- [XNNPACK (CPU)](backends-xnnpack.md) - [Core ML (iOS)](backends-coreml.md) - [Metal Performance Shaders (iOS GPU)](backends-mps.md) - [Vulkan (Android GPU)](backends-vulkan.md) @@ -141,7 +141,6 @@ delegate_external_constants_pass_unlifted( exported_program = export(tagged_module, inputs, dynamic_shapes=dynamic_shapes) executorch_program = to_edge_transform_and_lower( exported_program, - transform_passes = [partial_function], partitioner = [XnnpackPartitioner()] ).to_executorch() ``` @@ -184,6 +183,7 @@ For more complex use cases, dynamic shape specification allows for mathematical Before integrating the runtime code, it is common to test the exported model from Python. This can be used to evaluate model accuracy and sanity check behavior before moving to the target device. Note that not all hardware backends are available from Python, as they may require specialized hardware to function. See the specific backend documentation for more information on hardware requirements and the availablilty of simulators. The XNNPACK delegate used in this example is always available on host machines. ```python +import torch from executorch.runtime import Runtime runtime = Runtime.get() @@ -194,9 +194,19 @@ method = program.load_method("forward") outputs = method.execute([input_tensor]) ``` -Pybindings currently does not support loading program and data. To run a model with PTE and PTD components, please use the [Extension Module](extension-module.md). There is also an E2E demo in [executorch-examples](https://github.com/meta-pytorch/executorch-examples/tree/main/program-data-separation). +To run a model with program and data separated, please use the [ExecuTorch Module pybindings](https://github.com/pytorch/executorch/blob/main/extension/pybindings/README.md). 
+```python +import torch +from executorch.extension.pybindings import portable_lib + +input_tensor = torch.randn(1, 3, 32, 32) +module = portable_lib._load_for_executorch("model.pte", "model.ptd") +outputs = module.forward([input_tensor]) +``` + +There is also an E2E demo in [executorch-examples](https://github.com/meta-pytorch/executorch-examples/tree/main/program-data-separation). -For more information, see [Runtime API Reference](executorch-runtime-api-reference.md). +For more information, see [Runtime API Reference](executorch-runtime-api-reference.rst). ## Advanced Topics @@ -270,7 +280,7 @@ decode_ep = torch.export.export(DecodeWrapper(model), ...) ## Next Steps -The PyTorch and ExecuTorch export and lowering APIs provide a high level of customizability to meet the needs of diverse hardware and models. See [torch.export](https://pytorch.org/docs/main/export.html) and [Export API Reference](export-to-executorch-api-reference.md) for more information. +The PyTorch and ExecuTorch export and lowering APIs provide a high level of customizability to meet the needs of diverse hardware and models. See [torch.export](https://pytorch.org/docs/main/export.html) and [Export API Reference](export-to-executorch-api-reference.rst) for more information. For advanced use cases, see the following: - [Quantization Overview](quantization-overview.md) for information on quantizing models to reduce inference time and memory footprint. diff --git a/docs/source/using-executorch-faqs.md b/docs/source/using-executorch-faqs.md index d1bd0390569..c147403c9e8 100644 --- a/docs/source/using-executorch-faqs.md +++ b/docs/source/using-executorch-faqs.md @@ -16,7 +16,7 @@ if you are using Ubuntu, or use an equivalent install command. ### ModuleNotFoundError: No module named 'pytorch_tokenizers' -The `pytorch_tokenizers` package is required for LLM export functionality. Install it from the ExecutorTorch source code: +The `pytorch_tokenizers` package is required for LLM export functionality. 
Install it from the ExecuTorch source code: ``` pip install -e ./extension/llm/tokenizers/ ``` @@ -48,7 +48,7 @@ Thread count can be set with the following function. Ensure this is done prior t ::executorch::extension::threadpool::get_threadpool()->_unsafe_reset_threadpool(num_threads); ``` -For a deeper investgiation into model performance, ExecuTorch supports operator-level performance profiling. See [Using the ExecuTorch Developer Tools to Profile a Model](devtools-integration-tutorial.md) for more information. +For a deeper investigation into model performance, ExecuTorch supports operator-level performance profiling. See [Using the ExecuTorch Developer Tools to Profile a Model](devtools-integration-tutorial.md) for more information. ### Missing Logs diff --git a/docs/source/using-executorch-runtime-integration.md b/docs/source/using-executorch-runtime-integration.md index 550cb3eb71a..36bc4f6b2fe 100644 --- a/docs/source/using-executorch-runtime-integration.md +++ b/docs/source/using-executorch-runtime-integration.md @@ -64,7 +64,7 @@ namespace { ``` ### Weak Symbol Override -ExecuTorch also provides a link-time method to override the PAL using weak symbols. This method is primarily maintained for backwards compatability. +ExecuTorch also provides a link-time method to override the PAL using weak symbols. This method is primarily maintained for backwards compatibility. To override one or more PAL methods, take the following steps: diff --git a/docs/source/using-executorch-troubleshooting.md b/docs/source/using-executorch-troubleshooting.md index 56c2e1a0653..75648dc5b46 100644 --- a/docs/source/using-executorch-troubleshooting.md +++ b/docs/source/using-executorch-troubleshooting.md @@ -1,11 +1,11 @@ # Profiling and Debugging -To faciliate model and runtime integration, ExecuTorch provides tools to profile model resource utilization, numerics, and more. 
This section describes the available troubleshooting tools and steps to resolve issues when integrating ExecuTorch. +To facilitate model and runtime integration, ExecuTorch provides tools to profile model resource utilization, numerics, and more. This section describes the available troubleshooting tools and steps to resolve issues when integrating ExecuTorch. ## General Troubleshooting Steps - To troubleshoot failure of runtime API calls, such as loading or running a model, ensure that ExecuTorch framework logging is enabled. See [Logging](using-executorch-runtime-integration.md#logging) for more information. -- As a prelimatinary step to troubleshoot slow run times, ensure that performance testing is being done in a release build, and that the model is delegated. See [Inference is Slow](using-executorch-faqs.md#inference-is-slow--performance-troubleshooting) for more information. +- As a preliminary step to troubleshoot slow run times, ensure that performance testing is being done in a release build, and that the model is delegated. See [Inference is Slow](using-executorch-faqs.md#inference-is-slow--performance-troubleshooting) for more information. - Check [Frequently Asked Questions](using-executorch-faqs.md) for common issues and questions encountered during install, model export, and runtime integration. ## Developer Tools @@ -16,5 +16,5 @@ The ExecuTorch developer tools, or devtools, are a collection of tooling for tro - [Frequently Asked Questions](using-executorch-faqs.md) for solutions to commonly encountered questions and issues. - [Introduction to the ExecuTorch Developer Tools](runtime-profiling.md) for a high-level introduction to available developer tooling. -- [Using the ExecuTorch Developer Tools to Profile a Model](https://pytorch.org/executorch/main/tutorials/devtools-integration-tutorial) for information on runtime performance profiling. 
+- [Using the ExecuTorch Developer Tools to Profile a Model](tutorials/devtools-integration-tutorial) for information on runtime performance profiling. - [Inspector APIs](runtime-profiling.md) for reference material on trace inspector APIs. diff --git a/examples/apple/coreml/llama/export.py b/examples/apple/coreml/llama/export.py index 48edc3c0669..af2fa3c74ee 100644 --- a/examples/apple/coreml/llama/export.py +++ b/examples/apple/coreml/llama/export.py @@ -23,7 +23,6 @@ from executorch.exir.backend.utils import format_delegated_graph from executorch.exir.capture._config import ExecutorchBackendConfig from executorch.exir.passes import MemoryPlanningPass -from executorch.exir.passes.quant_fusion_pass import QuantFusionPass from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass from executorch.extension.export_util.utils import save_pte_program @@ -211,9 +210,7 @@ def main() -> None: executorch_program = edge_manager.to_executorch( ExecutorchBackendConfig( extract_delegate_segments=True, - passes=[ - QuantFusionPass(), - ], + do_quant_fusion_and_const_prop=True, memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False), sym_shape_eval_pass=ConstraintBasedSymShapeEvalPass(), ) diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py index 106ab35363c..34ed7e3f1bd 100644 --- a/examples/arm/aot_arm_compiler.py +++ b/examples/arm/aot_arm_compiler.py @@ -9,7 +9,6 @@ import argparse import copy -import json import logging import os @@ -19,25 +18,24 @@ import torch from examples.devtools.scripts.export_bundled_program import save_bundled_program from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec -from executorch.backends.arm.ethosu import EthosUCompileSpec, EthosUPartitioner -from executorch.backends.arm.quantizer import ( - EthosUQuantizer, - get_symmetric_quantization_config, - TOSAQuantizer, - VgfQuantizer, -) +from executorch.backends.arm.ethosu import EthosUCompileSpec +from 
executorch.backends.arm.quantizer import get_symmetric_quantization_config from executorch.backends.arm.tosa import TosaSpecification from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec -from executorch.backends.arm.tosa.partitioner import TOSAPartitioner +from executorch.backends.arm.util._factory import create_partitioner, create_quantizer from executorch.backends.arm.util.arm_model_evaluator import ( - GenericModelEvaluator, - MobileNetV2Evaluator, + evaluate_model, + evaluator_calibration_data, ) -from executorch.backends.arm.vgf import VgfCompileSpec, VgfPartitioner +from executorch.backends.arm.vgf import VgfCompileSpec # To use Cortex-M backend +from executorch.backends.cortex_m.passes.quantized_linear_fusion_pass import ( + QuantizedLinearFusionPass, +) + from executorch.backends.cortex_m.passes.quantized_op_fusion_pass import ( QuantizedOpFusionPass, ) @@ -55,8 +53,11 @@ ExecutorchBackendConfig, to_edge_transform_and_lower, ) + from executorch.extension.export_util.utils import save_pte_program from tabulate import tabulate +from torch.export import ExportedProgram +from torch.fx import GraphModule from torch.utils.data import DataLoader # Quantize model if required using the standard export quantizaion flow. 
@@ -141,25 +142,19 @@ def get_model_and_inputs_from_name( def quantize( - model: torch.nn.Module, + model: GraphModule, model_name: str, compile_specs: EthosUCompileSpec | VgfCompileSpec | TosaCompileSpec, example_inputs: Tuple[torch.Tensor], evaluator_name: str | None, evaluator_config: Dict[str, Any] | None, -) -> torch.nn.Module: - """This is the official recommended flow for quantization in pytorch 2.0 export""" +) -> GraphModule: + """This is the official recommended flow for quantization in pytorch 2.0 + export""" logging.info("Quantizing Model...") logging.debug(f"Original model: {model}") - quantizer = None - if isinstance(compile_specs, EthosUCompileSpec): - quantizer = EthosUQuantizer(compile_specs) - elif isinstance(compile_specs, TosaCompileSpec): - quantizer = TOSAQuantizer(compile_specs) - elif isinstance(compile_specs, VgfCompileSpec): - quantizer = VgfQuantizer(compile_specs) - else: - raise RuntimeError("Unsupported compilespecs for quantization!") + + quantizer = create_quantizer(compile_specs) operator_config = get_symmetric_quantization_config() quantizer.set_global(operator_config) @@ -182,46 +177,6 @@ def quantize( return m -# Simple example models -class AddModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return x + x - - example_input = (torch.ones(5, dtype=torch.int32),) - can_delegate = True - - -class AddModule2(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - return x + y - - example_input = ( - torch.ones(5, dtype=torch.int32), - torch.ones(5, dtype=torch.int32), - ) - can_delegate = True - - -class AddModule3(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - return (x + y, x + x) - - example_input = ( - torch.ones(5, dtype=torch.int32), - torch.ones(5, dtype=torch.int32), - ) - can_delegate = True - - class QuantAddTest(torch.nn.Module): def __init__(self): super().__init__() @@ -270,48 +225,29 @@ def 
forward(self, w, x, y, z): can_delegate = True # when quantized -class SoftmaxModule(torch.nn.Module): +class QuantLinearTest(torch.nn.Module): def __init__(self): super().__init__() - self.softmax = torch.nn.Softmax(dim=0) + # Define a simple linear layer + self.linear = torch.nn.Linear(61, 37) def forward(self, x): - z = self.softmax(x) - return z + return self.linear(x) - example_input = (torch.ones(2, 2),) - can_delegate = True - - -class MultipleOutputsModule(torch.nn.Module): - def forward(self, x: torch.Tensor, y: torch.Tensor): - return (x * y, x.sum(dim=-1, keepdim=True)) - - example_input = (torch.randn(10, 4, 5), torch.randn(10, 4, 5)) + example_input = (torch.randn([8, 61], dtype=torch.float32),) can_delegate = True models = { - "add": AddModule, - "add2": AddModule2, - "add3": AddModule3, "qadd": QuantAddTest, "qadd2": QuantAddTest2, "qops": QuantOpTest, - "softmax": SoftmaxModule, - "MultipleOutputsModule": MultipleOutputsModule, + # TODO: Remove this from here, once we have dedicated MCU test pipeline ready. This is an interim solution. 
+ # See https://github.com/pytorch/executorch/discussions/13944 + "qlinear": QuantLinearTest, } calibration_data = { - "add": (torch.randn(1, 5),), - "add2": ( - torch.randn(1, 5), - torch.randn(1, 5), - ), - "add3": ( - torch.randn(32, 5), - torch.randn(32, 5), - ), "qadd": (torch.randn(32, 2, 1),), "qadd2": ( torch.randn(32, 2, 1), @@ -323,12 +259,6 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): torch.randn(32, 2, 1) * -0.000001, torch.randn(32, 2, 1) * 1000, ), - "softmax": (torch.randn(32, 2, 2),), -} - -evaluators = { - "generic": GenericModelEvaluator, - "mv2": MobileNetV2Evaluator, } targets = [ @@ -355,21 +285,9 @@ def get_calibration_data( ): # Firstly, if the model is being evaluated, take the evaluators calibration function if it has one if evaluator_name is not None: - evaluator = evaluators[evaluator_name] - - if hasattr(evaluator, "get_calibrator"): - assert evaluator_config is not None - - config_path = Path(evaluator_config) - with config_path.open() as f: - config = json.load(f) - - if evaluator_name == "mv2": - return evaluator.get_calibrator( - training_dataset_path=config["training_dataset_path"] - ) - else: - raise RuntimeError(f"Unknown evaluator: {evaluator_name}") + evaluator_data = evaluator_calibration_data(evaluator_name, evaluator_config) + if evaluator_data is not None: + return evaluator_data # If the model is in the calibration_data dictionary, get the data from there # This is used for the simple model examples provided @@ -397,11 +315,14 @@ def get_compile_spec( tosa_spec = TosaSpecification.create_from_string("TOSA-1.0+INT") compile_spec = TosaCompileSpec(tosa_spec) elif "ethos-u" in target: + extra_flags = ["--verbose-operators", "--verbose-cycle-estimate"] + if debug_mode is not None: + extra_flags.append("--enable-debug-db") compile_spec = EthosUCompileSpec( target, system_config=system_config, memory_mode=memory_mode, - extra_flags=["--verbose-operators", "--verbose-cycle-estimate"], + extra_flags=extra_flags, 
config_ini=config, ) elif "vgf" in target: @@ -423,52 +344,6 @@ def get_compile_spec( return compile_spec -def evaluate_model( - model_name: str, - intermediates: str, - model_fp32: torch.nn.Module, - model_int8: torch.nn.Module, - example_inputs: Tuple[torch.Tensor], - evaluator_name: str, - evaluator_config: str | None, -) -> None: - evaluator = evaluators[evaluator_name] - - # Get the path of the TOSA flatbuffer that is dumped - intermediates_path = Path(intermediates) - tosa_paths = list(intermediates_path.glob("*.tosa")) - - if evaluator.REQUIRES_CONFIG: - assert evaluator_config is not None - - config_path = Path(evaluator_config) - with config_path.open() as f: - config = json.load(f) - - if evaluator_name == "mv2": - init_evaluator = evaluator( - model_name, - model_fp32, - model_int8, - example_inputs, - str(tosa_paths[0]), - config["batch_size"], - config["validation_dataset_path"], - ) - else: - raise RuntimeError(f"Unknown evaluator {evaluator_name}") - else: - init_evaluator = evaluator( - model_name, model_fp32, model_int8, example_inputs, str(tosa_paths[0]) - ) - - quant_metrics = init_evaluator.evaluate() - output_json_path = intermediates_path / "quant_metrics.json" - - with output_json_path.open("w") as json_file: - json.dump(quant_metrics, json_file) - - def dump_delegation_info(edge, intermediate_files_folder: Optional[str] = None): graph_module = edge.exported_program().graph_module delegation_info = get_delegation_info(graph_module) @@ -535,7 +410,7 @@ def get_args(): required=False, nargs="?", const="generic", - choices=["generic", "mv2"], + choices=["generic", "mv2", "deit_tiny"], help="Flag for running evaluation of the model.", ) parser.add_argument( @@ -593,7 +468,7 @@ def get_args(): "--config", required=False, default="Arm/vela.ini", - help="Specify custom vela configuration file (vela.ini)", + help="Specify custom vela configuration file (vela.ini) for Ethos-U targets.", ) parser.add_argument( "--non_strict_export", @@ -605,13 +480,13 
@@ def get_args(): parser.add_argument( "--enable_qdq_fusion_pass", action="store_true", - help="Enable the QuantizedOpFusionPass fusion step", + help="Enable the Quantized qdq fusion Op passes", ) parser.add_argument( "--enable_debug_mode", required=False, choices=["json", "tosa"], - help="Flag to enable ATen-to-TOSA debug mode.", + help="Flag to enable ATen-to-TOSA debug mode and dumping of Vela's debug database.", ) args = parser.parse_args() @@ -718,7 +593,12 @@ def save_bpte_program(exec_prog, original_model: torch.nn.Module, output_name: s save_bundled_program(exec_prog, method_test_suites, output_name) -def quantize_model(args, model: torch.nn.Module, example_inputs, compile_spec): +def quantize_model( + args, + model: GraphModule, + example_inputs: Tuple[torch.Tensor], + compile_spec, +) -> Tuple[GraphModule, ExportedProgram]: model_int8 = quantize( model, args.model_name, @@ -736,7 +616,10 @@ def quantize_model(args, model: torch.nn.Module, example_inputs, compile_spec): def to_edge_TOSA_delegate( - exported_program, args, model: torch.nn.Module, example_inputs + exported_program: ExportedProgram, + args, + model: GraphModule, + example_inputs: Tuple[torch.Tensor], ): # As we can target multiple output encodings, one must # be specified. 
@@ -755,16 +638,8 @@ def to_edge_TOSA_delegate( model_int8, exported_program = quantize_model( args, model, example_inputs, compile_spec ) - model = model_int8 - - if isinstance(compile_spec, EthosUCompileSpec): - partitioner = EthosUPartitioner(compile_spec) - elif isinstance(compile_spec, TosaCompileSpec): - partitioner = TOSAPartitioner(compile_spec) - elif isinstance(compile_spec, VgfCompileSpec): - partitioner = VgfPartitioner(compile_spec) - else: - raise RuntimeError(f"Unhandled compile spec: {compile_spec}") + + partitioner = create_partitioner(compile_spec) edge = to_edge_transform_and_lower( exported_program, @@ -777,7 +652,12 @@ def to_edge_TOSA_delegate( return model_int8, edge -def to_edge_no_delegate(exported_program, args, model: torch.nn.Module, example_inputs): +def to_edge_no_delegate( + exported_program: ExportedProgram, + args, + model: GraphModule, + example_inputs: Tuple[torch.Tensor], +): model_int8 = None if args.quantize: # As we can target multiple output encodings, one must @@ -806,22 +686,24 @@ def to_edge_no_delegate(exported_program, args, model: torch.nn.Module, example_ return model_int8, edge -def transform_for_cortex_m_backend(edge, args): +def transform_for_cortex_m_backend(edge_program_manager, args): # Let's make sure we are using optimized Cortex M backend # NB: If we can't find and replace ops those are expected to be replaced, # bad things will happen at runtime, like "missing operator" errors! 
# Instantiate the mandatory ReplaceQuantNodesPass - passes = [ReplaceQuantNodesPass()] - - # Conditionally add the QuantizedOpFusionPass + passes = [ReplaceQuantNodesPass] if args.enable_qdq_fusion_pass: - passes.append(QuantizedOpFusionPass()) - - # Apply the passes - edge = edge.transform(passes) - - return edge + passes += [QuantizedLinearFusionPass, QuantizedOpFusionPass] + current_edge = edge_program_manager + for pass_cls in passes: + transform_pass = ( + pass_cls(current_edge.exported_program()) + if pass_cls.__name__ == "QuantizedLinearFusionPass" + else pass_cls() + ) + current_edge = current_edge.transform([transform_pass]) + return current_edge if __name__ == "__main__": # noqa: C901 diff --git a/examples/arm/ethos-u-setup/core_platform/0001-Remove-hello_world-from-applications.patch b/examples/arm/ethos-u-setup/core_platform/0001-Remove-hello_world-from-applications.patch new file mode 100644 index 00000000000..11590a8578f --- /dev/null +++ b/examples/arm/ethos-u-setup/core_platform/0001-Remove-hello_world-from-applications.patch @@ -0,0 +1,25 @@ +From f6a7d867212336b3e344c21240a2a03671bffd65 Mon Sep 17 00:00:00 2001 +From: Per Held +Date: Wed, 17 Sep 2025 13:46:05 +0200 +Subject: Remove hello_world from applications + +--- + applications/CMakeLists.txt | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/applications/CMakeLists.txt b/applications/CMakeLists.txt +index a017575..130f0f7 100644 +--- a/applications/CMakeLists.txt ++++ b/applications/CMakeLists.txt +@@ -21,7 +21,7 @@ add_subdirectory(driver_unit_tests) + + add_subdirectory(freertos) + +-add_subdirectory(hello_world) ++#add_subdirectory(hello_world) + + add_subdirectory(threadx_demo) + +-- +2.43.0 + diff --git a/examples/arm/ethos_u_minimal_example.ipynb b/examples/arm/ethos_u_minimal_example.ipynb index dc8ea7193aa..6637800e456 100644 --- a/examples/arm/ethos_u_minimal_example.ipynb +++ b/examples/arm/ethos_u_minimal_example.ipynb @@ -58,7 +58,7 @@ "model = Add()\n", 
"model = model.eval()\n", "exported_program = torch.export.export(model, example_inputs)\n", - "graph_module = exported_program.module()\n", + "graph_module = exported_program.graph_module\n", "\n", "_ = graph_module.print_readable()" ] @@ -160,7 +160,7 @@ " config=ExecutorchBackendConfig(extract_delegate_segments=False)\n", " )\n", "\n", - "_ = executorch_program_manager.exported_program().module().print_readable()\n", + "_ = executorch_program_manager.exported_program().graph_module.print_readable()\n", "\n", "# Save pte file\n", "save_pte_program(executorch_program_manager, \"ethos_u_minimal_example.pte\")" diff --git a/examples/arm/executor_runner/CMakeLists.txt b/examples/arm/executor_runner/CMakeLists.txt index 4e4a8eeb409..d5038a1a6b8 100644 --- a/examples/arm/executor_runner/CMakeLists.txt +++ b/examples/arm/executor_runner/CMakeLists.txt @@ -235,10 +235,10 @@ list( -Map=arm_executor_runner.map ) -# Prefer to generate kernel bindings from model file if possible, which is when -# 1. Not building for semihosting 2. Not building with bundleio If that is not -# the case, fallback to select_ops_list If the model file does not contain any -# aten ops, a workaround is currently needed to avoid crashing. +# Figure out which ops to include: For semihosting build, use +# (user-set)SELECT_OPS_MODEL variable. For normal build, use +# EXECUTORCH_SELECT_OPS_MODEL to include ops automatically. If the pte contains +# no undelegated ops, use neither. 
execute_process( COMMAND python "${ET_DIR_PATH}/codegen/tools/gen_oplist.py" @@ -264,11 +264,6 @@ elseif(${FOUND_OPS_IN_FILE}) message( "gen_oplist: EXECUTORCH_SELECT_OPS_MODEL=${ET_PTE_FILE_PATH} is used to auto generate ops from" ) -elseif(NOT ${FOUND_OPS_IN_FILE} AND ${ET_BUNDLE_IO}) - set(EXECUTORCH_SELECT_OPS_MODEL "") - message( - "gen_oplist: Building with ET_BUNDLE_IO and .bpte is not supported to auto generate ops from will use EXECUTORCH_SELECT_OPS_LIST=${EXECUTORCH_SELECT_OPS_LIST}" - ) else() set(EXECUTORCH_SELECT_OPS_LIST "") set(EXECUTORCH_SELECT_OPS_MODEL "") diff --git a/examples/arm/run.sh b/examples/arm/run.sh index 8f5dec85ad4..91e34b09cbd 100755 --- a/examples/arm/run.sh +++ b/examples/arm/run.sh @@ -53,8 +53,8 @@ function help() { echo " --no_delegate Do not delegate the model (can't override builtin models)" echo " --no_quantize Do not quantize the model (can't override builtin models)" echo " --portable_kernels= TO BE DEPRECATED: Alias to select_ops_list." - echo " --select_ops_list= Comma separated list of portable (non delegated) kernels to include Default: ${select_ops_list}" - echo " NOTE: This is used when select_ops_model is not possible to use, e.g. for semihosting or bundleio." + echo " --select_ops_list= Comma separated list of portable (non delegated) kernels to include Default: ${select_ops_list}" + echo " NOTE: This is only used when building for semihosting." echo " See https://docs.pytorch.org/executorch/stable/kernel-library-selective-build.html for more information."
echo " --target= Target to build and run for Default: ${target}" echo " --output= Target build output folder Default: ${output_folder}" @@ -225,7 +225,6 @@ if [[ -z "$model_name" ]]; then test_model=( "softmax" # 0 "add" # 1 - "add3" # 2 "qadd" # 3 "qadd2" # 4 "qops" # 5 @@ -234,7 +233,6 @@ if [[ -z "$model_name" ]]; then model_compiler_flags=( "" # 0 softmax "--delegate" # 1 add - "--delegate" # 2 add3 "--delegate --quantize" # 3 qadd "--delegate --quantize" # 4 qadd2 "--delegate --quantize" # 5 qops diff --git a/examples/arm/run_mcu_models_fvp.sh b/examples/arm/run_mcu_models_fvp.sh index 68d5ec03003..3fa980c506b 100755 --- a/examples/arm/run_mcu_models_fvp.sh +++ b/examples/arm/run_mcu_models_fvp.sh @@ -24,9 +24,9 @@ VALID_TARGETS=( ) # Default models for MCU validation with portable kernels -DEFAULT_MODELS=(mv2 mv3 lstm) +DEFAULT_MODELS=(mv2 mv3 lstm qadd qlinear) # Available models (on FVP) -AVAILABLE_MODELS=(mv2 mv3 lstm) +AVAILABLE_MODELS=(mv2 mv3 lstm qadd qlinear) # Add the following models if you want to enable them later (atm they are not working on FVP) # edsr w2l ic3 ic4 resnet18 resnet50 @@ -257,6 +257,7 @@ for model in "${MODELS[@]}"; do -m "$model" \ --target="$ETHOS_TARGET" \ --quantize \ + --enable_qdq_fusion_pass \ --output="arm_test/$model"; then echo "❌ AOT compilation failed for $model" MODEL_SUCCESS=false diff --git a/examples/arm/vgf_minimal_example.ipynb b/examples/arm/vgf_minimal_example.ipynb index 36004f2c7cd..1f8e0a61601 100644 --- a/examples/arm/vgf_minimal_example.ipynb +++ b/examples/arm/vgf_minimal_example.ipynb @@ -56,8 +56,8 @@ "\n", "model = Add()\n", "model = model.eval()\n", - "exported_program = torch.export.export_for_training(model, example_inputs)\n", - "graph_module = exported_program.module()\n", + "exported_program = torch.export.export(model, example_inputs)\n", + "graph_module = exported_program.graph_module\n", "\n", "_ = graph_module.print_readable()" ] @@ -197,7 +197,7 @@ " 
config=ExecutorchBackendConfig(extract_delegate_segments=False)\n", ")\n", "\n", - "executorch_program_manager.exported_program().module().print_readable()\n", + "executorch_program_manager.exported_program().graph_module.print_readable()\n", "\n", "# Save pte file\n", "cwd_dir = os.getcwd()\n", @@ -240,6 +240,7 @@ " -DCMAKE_BUILD_TYPE=Debug \\\n", " -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \\\n", " -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \\\n", + " -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \\\n", " -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \\\n", " -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \\\n", " -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \\\n", diff --git a/examples/cadence/models/babyllama.py b/examples/cadence/models/babyllama.py index 1b576a1a3eb..f393cd30037 100644 --- a/examples/cadence/models/babyllama.py +++ b/examples/cadence/models/babyllama.py @@ -14,8 +14,10 @@ from executorch.backends.cadence.aot.export_example import export_and_run_model -from executorch.examples.models.llama.llama_transformer import ModelArgs, Transformer - +from executorch.examples.models.llama.llama_transformer import ( + construct_transformer, + ModelArgs, +) FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" logging.basicConfig(level=logging.INFO, format=FORMAT) @@ -32,7 +34,7 @@ def main() -> None: ) seq = 64 b = 1 - model = Transformer(args) + model = construct_transformer(args) example_inputs = (torch.randint(0, 10, [b, seq], dtype=torch.int64),) export_and_run_model(model, example_inputs) diff --git a/examples/cuda/scripts/__init__.py b/examples/cuda/scripts/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/examples/cuda/scripts/export.py b/examples/cuda/scripts/export.py new file mode 100644 index 00000000000..c103d7ee50a --- /dev/null +++ b/examples/cuda/scripts/export.py @@ -0,0 +1,116 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Example script for exporting simple models to flatbuffer with CUDA delegate. + +import argparse +import pathlib + +import torch + +from executorch.backends.cuda.cuda_backend import CudaBackend + +from executorch.backends.cuda.cuda_partitioner import CudaPartitioner + +from executorch.examples.models import MODEL_NAME_TO_MODEL +from executorch.examples.models.model_factory import EagerModelFactory + +from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower + +from executorch.extension.export_util.utils import save_pte_program +from torch._inductor.decomposition import conv1d_to_conv2d +from torch.nn.attention import SDPBackend + +# Script to export a model with CUDA delegation. + +_EDGE_COMPILE_CONFIG = EdgeCompileConfig( + _check_ir_validity=False, + _skip_dim_order=True, # TODO(T182928844): enable dim_order in backend +) + + +def is_fbcode(): + return not hasattr(torch.version, "git_version") + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser() + + parser.add_argument( + "-m", + "--model_name", + required=True, + help=f"Provide model name. 
Valid ones: {list(MODEL_NAME_TO_MODEL.keys())}", + ) + parser.add_argument( + "--output_dir", + type=pathlib.Path, + default=pathlib.Path("./"), + help="Output directory for the exported model", + ) + parser.add_argument("--generate_etrecord", action=argparse.BooleanOptionalAction) + parser.add_argument("--save_processed_bytes", action=argparse.BooleanOptionalAction) + + args = parser.parse_args() + return args + + +def save_processed_bytes(processed_bytes, base_name: str): + filename = f"{base_name}.bin" + print(f"Saving processed bytes to {filename}") + with open(filename, "wb") as file: + file.write(processed_bytes) + return + + +def main(): + args = parse_args() + + if args.model_name not in MODEL_NAME_TO_MODEL: + raise RuntimeError( + f"Model {args.model_name} is not a valid name. " + f"Available models are {list(MODEL_NAME_TO_MODEL.keys())}." + ) + + ( + model, + example_args, + example_kwargs, + dynamic_shapes, + ) = EagerModelFactory.create_model(*MODEL_NAME_TO_MODEL[args.model_name]) + model = model.eval() + exported_programs = torch.export.export( + model, + args=example_args, + kwargs=example_kwargs, + dynamic_shapes=dynamic_shapes, + ) + print(exported_programs) + + partitioner = CudaPartitioner( + [CudaBackend.generate_method_name_compile_spec(args.model_name)] + ) + # Add decompositions for triton to generate kernels.
+ exported_programs = exported_programs.run_decompositions( + { + torch.ops.aten.conv1d.default: conv1d_to_conv2d, + } + ) + with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]): + et_prog = to_edge_transform_and_lower( + exported_programs, + partitioner=[partitioner], + compile_config=_EDGE_COMPILE_CONFIG, + generate_etrecord=args.generate_etrecord, + ) + exec_program = et_prog.to_executorch() + save_pte_program(exec_program, args.model_name, args.output_dir) + if args.generate_etrecord: + exec_program.get_etrecord().save(f"{args.model_name}_etrecord.bin") + + +if __name__ == "__main__": + main() diff --git a/examples/demo-apps/android/LlamaDemo/.gitignore b/examples/demo-apps/android/LlamaDemo/.gitignore deleted file mode 100644 index 41853c0472c..00000000000 --- a/examples/demo-apps/android/LlamaDemo/.gitignore +++ /dev/null @@ -1,12 +0,0 @@ -*.iml -.gradle -/local.properties -.idea -.DS_Store -/build -/captures -.externalNativeBuild -.cxx -local.properties -*.so -*.aar diff --git a/examples/demo-apps/android/LlamaDemo/README.md b/examples/demo-apps/android/LlamaDemo/README.md deleted file mode 100644 index 9a6b3b020e7..00000000000 --- a/examples/demo-apps/android/LlamaDemo/README.md +++ /dev/null @@ -1,174 +0,0 @@ -# ExecuTorch Llama Android Demo App - -**[UPDATE - 2025-05-15]** We have added support for running Qwen3 0.6B and 4B model. Please see [this tutorial](https://github.com/pytorch/executorch/tree/main/examples/models/qwen3#summary) for export. Loading and running Qwen3 with this app is the same as Llama, as in this doc. - -We’re excited to share that the newly revamped Android demo app is live and includes many new updates to provide a more intuitive and smoother user experience with a chat use case! The primary goal of this app is to showcase how easily ExecuTorch can be integrated into an Android demo app and how to exercise the many features ExecuTorch and Llama models have to offer. 
- -This app serves as a valuable resource to inspire your creativity and provide foundational code that you can customize and adapt for your particular use case. - -Please dive in and start exploring our demo app today! We look forward to any feedback and are excited to see your innovative ideas. - - -## Key Concepts -From this demo app, you will learn many key concepts such as: -* How to prepare Llama models, build the ExecuTorch library, and model inferencing across delegates -* Expose the ExecuTorch library via JNI layer -* Familiarity with current ExecuTorch app-facing capabilities - -The goal is for you to see the type of support ExecuTorch provides and feel comfortable with leveraging it for your use cases. - -## Supporting Models -As a whole, the models that this app supports are (varies by delegate): -* Llama 3.2 Quantized 1B/3B -* Llama 3.2 1B/3B in BF16 -* Llama Guard 3 1B -* Llama 3.1 8B -* Llama 3 8B -* Llama 2 7B -* LLaVA-1.5 vision model (only XNNPACK) -* Qwen 3 0.6B, 1.7B, and 4B - - -## Building the APK -First it’s important to note that currently ExecuTorch provides support across 3 delegates. 
Once you identify the delegate of your choice, select the README link to get a complete end-to-end instructions for environment set-up to exporting the models to build ExecuTorch libraries and apps to run on device: - -| Delegate | Resource | -| ------------- | ------------- | -| XNNPACK (CPU-based library) | [link](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md) | -| QNN (Qualcomm AI Accelerators) | [link](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md) | -| MediaTek (MediaTek AI Accelerators) | [link](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/LlamaDemo/docs/delegates/mediatek_README.md) | - - -## How to Use the App - -This section will provide the main steps to use the app, along with a code snippet of the ExecuTorch API. - -For loading the app, development, and running on device we recommend Android Studio: -1. Open Android Studio and select "Open an existing Android Studio project" to open examples/demo-apps/android/LlamaDemo. -2. Run the app (^R). This builds and launches the app on the phone. - -### Opening the App - -Below are the UI features for the app. - -Select the settings widget to get started with picking a model, its parameters and any prompts. -

- -

- - - -### Select Models and Parameters - -Once you've selected the model, tokenizer, and model type you are ready to click on "Load Model" to have the app load the model and go back to the main Chat activity. -

- -

- - - -Optional Parameters: -* Temperature: Defaulted to 0, you can adjust the temperature for the model as well. The model will reload upon any adjustments. -* System Prompt: Without any formatting, you can enter in a system prompt. For example, "you are a travel assistant" or "give me a response in a few sentences". -* User Prompt: More for the advanced user, if you would like to manually input a prompt then you can do so by modifying the `{{user prompt}}`. You can also modify the special tokens as well. Once changed then go back to the main Chat activity to send. - -#### ExecuTorch App API - -```java -// Upon returning to the Main Chat Activity -mModule = new LlmModule( - ModelUtils.getModelCategory(mCurrentSettingsFields.getModelType()), - modelPath, - tokenizerPath, - temperature); -int loadResult = mModule.load(); -``` - -* `modelCategory`: Indicate whether it’s a text-only or vision model -* `modePath`: path to the .pte file -* `tokenizerPath`: path to the tokenizer file -* `temperature`: model parameter to adjust the randomness of the model’s output - - -### User Prompt -Once model is successfully loaded then enter any prompt and click the send (i.e. generate) button to send it to the model. -

- -

- -You can provide it more follow-up questions as well. -

- -

- -#### ExecuTorch App API - -```java -mModule.generate(prompt,sequence_length, MainActivity.this); -``` -* `prompt`: User formatted prompt -* `sequence_length`: Number of tokens to generate in response to a prompt -* `MainActivity.this`: Indicate that the callback functions (OnResult(), OnStats()) are present in this class. - -[*LLaVA-1.5: Only for XNNPACK delegate*] - -For LLaVA-1.5 implementation, select the exported LLaVA .pte and tokenizer file in the Settings menu and load the model. After this you can send an image from your gallery or take a live picture along with a text prompt to the model. - -

- -

- - -### Output Generated -To show completion of the follow-up question, here is the complete detailed response from the model. -

- -

- -#### ExecuTorch App API - -Ensure you have the following functions in your callback class that you provided in the `mModule.generate()`. For this example, it is `MainActivity.this`. -```java - @Override - public void onResult(String result) { - //...result contains token from response - //.. onResult will continue to be invoked until response is complete - } - - @Override - public void onStats(String stats) { - //... will be a json. See extension/llm/stats.h for the field definitions - } - -``` - -## Instrumentation Test -You can run the instrumentation test for sanity check. The test loads a model pte file and tokenizer.bin file -under `/data/local/tmp/llama`. - -### Model preparation -Go to ExecuTorch root, -```sh -curl -C - -Ls "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt" --output stories110M.pt -curl -C - -Ls "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model" --output tokenizer.model -# Create params.json file -touch params.json -echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json -python -m extension.llm.export.export_llm base.checkpoint=stories110M.pt base.params=params.json model.dtype_override="fp16" export.output_name=stories110m_h.pte model.use_kv_cache=True -python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin -``` -### Push model -```sh -adb mkdir -p /data/local/tmp/llama -adb push stories110m_h.pte /data/local/tmp/llama -adb push tokenizer.bin /data/local/tmp/llama -``` - -### Run test -Go to `examples/demo-apps/android/LlamaDemo`, -```sh -./gradlew connectedAndroidTest -``` - -## Reporting Issues -If you encountered any bugs or issues following this tutorial please file a bug/issue here on [Github](https://github.com/pytorch/executorch/issues/new), or join our discord [here](https://lnkd.in/gWCM4ViK). 
diff --git a/examples/demo-apps/android/LlamaDemo/SDK-quick-setup-guide.md b/examples/demo-apps/android/LlamaDemo/SDK-quick-setup-guide.md deleted file mode 100644 index 9ae79e96763..00000000000 --- a/examples/demo-apps/android/LlamaDemo/SDK-quick-setup-guide.md +++ /dev/null @@ -1,94 +0,0 @@ -# Guide to set up Java/SDK/NDK for Android - -Follow this doc if you haven't set up Java/SDK/NDK for Android development -already. -This doc provides a CLI tutorial to set them up. Otherwise, you can do the same -thing with Android Studio GUI. - -## Set up Java 17 -1. Download the archive from Oracle website. -Make sure you have read and agree with the terms and conditions from the website before downloading. -```bash -export DEV_HOME= -cd $DEV_HOME -``` -Linux: -```bash -curl https://download.oracle.com/java/17/archive/jdk-17.0.10_linux-x64_bin.tar.gz -o jdk-17.0.10.tar.gz -``` -macOS: -```bash -curl https://download.oracle.com/java/17/archive/jdk-17.0.10_macos-aarch64_bin.tar.gz -o jdk-17.0.10.tar.gz -``` -2. Unzip the archive. The directory named `jdk-17.0.10` is the Java root directory. -```bash -tar xf jdk-17.0.10.tar.gz -``` -3. Set `JAVA_HOME` and update `PATH`. - -Linux: -```bash -export JAVA_HOME="$DEV_HOME"/jdk-17.0.10 -export PATH="$JAVA_HOME/bin:$PATH" -``` -macOS: -```bash -export JAVA_HOME="$DEV_HOME"/jdk-17.0.10.jdk/Contents/Home -export PATH="$JAVA_HOME/bin:$PATH" -``` - -Note: Oracle has tutorials for installing Java on -[Linux](https://docs.oracle.com/en/java/javase/17/install/installation-jdk-linux-platforms.html#GUID-4A6BD592-1840-4BB4-A758-4CD49E9EE88B) -and [macOS](https://docs.oracle.com/en/java/javase/17/install/installation-jdk-macos.html#GUID-E8A251B6-D9A9-4276-ABC8-CC0DAD62EA33). -Some Linux distributions has JDK package in package manager. For example, Debian users can install -openjdk-17-jdk package. 
- -## Set up Android SDK/NDK -Android has a command line tool [sdkmanager](https://developer.android.com/tools/sdkmanager) which -helps users managing SDK and other tools related to Android development. - -1. Go to https://developer.android.com/studio and download the archive from "Command line tools -only" section. Make sure you have read and agree with the terms and conditions from the website. - -Linux: -```bash -curl https://dl.google.com/android/repository/commandlinetools-linux-11076708_latest.zip -o commandlinetools.zip -``` -macOS: -```bash -curl https://dl.google.com/android/repository/commandlinetools-mac-11076708_latest.zip -o commandlinetools.zip -``` -2. Unzip. -```bash -unzip commandlinetools.zip -``` -3. Specify a root for Android SDK. For example, we can put it under `$DEV_HOME/sdk`. - -``` -mkdir -p $DEV_HOME/sdk -export ANDROID_HOME="$(realpath $DEV_HOME/sdk)" -# Install SDK 34 -./cmdline-tools/bin/sdkmanager --sdk_root="${ANDROID_HOME}" --install "platforms;android-34" -# Install NDK -./cmdline-tools/bin/sdkmanager --sdk_root="${ANDROID_HOME}" --install "ndk;26.3.11579264" -# The NDK root is then under `ndk/`. -export ANDROID_NDK="$ANDROID_HOME/ndk/26.3.11579264" -``` - -### (Optional) Android Studio Setup -If you want to use Android Studio and never set up Java/SDK/NDK before, or if -you use the newly installed ones, follow these steps to set Android Studio to use -them. - -Copy these output paths to be used by Android Studio -```bash -echo $ANDROID_HOME -echo $ANDROID_NDK -echo $JAVA_HOME -``` - -Open a project in Android Studio. In Project Structure (File -> Project -Structure, or `⌘;`) -> SDK Location, -* Set Android SDK Location to the path of $ANDROID_HOME -* Set Android NDK Location to the path of $ANDROID_NDK -* Set JDK location (Click Gradle Settings link) -> Gradle JDK -> Add JDK... 
to the path of $JAVA_HOME diff --git a/examples/demo-apps/android/LlamaDemo/app/.gitignore b/examples/demo-apps/android/LlamaDemo/app/.gitignore deleted file mode 100644 index 796b96d1c40..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/build diff --git a/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts b/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts deleted file mode 100644 index 19cfda847db..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -plugins { - id("com.android.application") - id("org.jetbrains.kotlin.android") -} - -val qnnVersion: String? = project.findProperty("qnnVersion") as? String - -android { - namespace = "com.example.executorchllamademo" - compileSdk = 34 - - defaultConfig { - applicationId = "com.example.executorchllamademo" - minSdk = 28 - targetSdk = 33 - versionCode = 1 - versionName = "1.0" - - testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner" - vectorDrawables { useSupportLibrary = true } - externalNativeBuild { cmake { cppFlags += "" } } - } - - buildTypes { - release { - isMinifyEnabled = false - proguardFiles(getDefaultProguardFile("proguard-android-optimize.txt"), "proguard-rules.pro") - } - } - compileOptions { - sourceCompatibility = JavaVersion.VERSION_1_8 - targetCompatibility = JavaVersion.VERSION_1_8 - } - kotlinOptions { jvmTarget = "1.8" } - buildFeatures { compose = true } - composeOptions { kotlinCompilerExtensionVersion = "1.4.3" } - packaging { resources { excludes += "/META-INF/{AL2.0,LGPL2.1}" } } -} - -dependencies { - implementation("androidx.core:core-ktx:1.9.0") - implementation("androidx.lifecycle:lifecycle-runtime-ktx:2.6.1") - 
implementation("androidx.activity:activity-compose:1.7.0") - implementation(platform("androidx.compose:compose-bom:2023.03.00")) - implementation("androidx.compose.ui:ui") - implementation("androidx.compose.ui:ui-graphics") - implementation("androidx.compose.ui:ui-tooling-preview") - implementation("androidx.compose.material3:material3") - implementation("androidx.appcompat:appcompat:1.6.1") - implementation("androidx.camera:camera-core:1.3.0-rc02") - implementation("androidx.constraintlayout:constraintlayout:2.2.0-alpha12") - implementation("com.facebook.fbjni:fbjni:0.5.1") - implementation("com.google.code.gson:gson:2.8.6") - implementation(files("libs/executorch.aar")) - implementation("com.google.android.material:material:1.12.0") - implementation("androidx.activity:activity:1.9.0") - implementation("org.json:json:20250107") - if (!qnnVersion.isNullOrEmpty()) { - implementation("com.qualcomm.qti:qnn-runtime:$qnnVersion") - } - testImplementation("junit:junit:4.13.2") - androidTestImplementation("androidx.test.ext:junit:1.1.5") - androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1") - androidTestImplementation(platform("androidx.compose:compose-bom:2023.03.00")) - androidTestImplementation("androidx.compose.ui:ui-test-junit4") - debugImplementation("androidx.compose.ui:ui-tooling") - debugImplementation("androidx.compose.ui:ui-test-manifest") -} - -tasks.register("setup") { - doFirst { - exec { - commandLine("sh", "examples/demo-apps/android/LlamaDemo/setup.sh") - workingDir("../../../../../") - } - } -} - -tasks.register("setupQnn") { - doFirst { - exec { - commandLine("sh", "examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh") - workingDir("../../../../../") - } - } -} - -tasks.register("download_prebuilt_lib") { - doFirst { - exec { - commandLine("sh", "examples/demo-apps/android/LlamaDemo/download_prebuilt_lib.sh") - workingDir("../../../../../") - } - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/proguard-rules.pro 
b/examples/demo-apps/android/LlamaDemo/app/proguard-rules.pro deleted file mode 100644 index 481bb434814..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/proguard-rules.pro +++ /dev/null @@ -1,21 +0,0 @@ -# Add project specific ProGuard rules here. -# You can control the set of applied configuration files using the -# proguardFiles setting in build.gradle. -# -# For more details, see -# http://developer.android.com/guide/developing/tools/proguard.html - -# If your project uses WebView with JS, uncomment the following -# and specify the fully qualified class name to the JavaScript interface -# class: -#-keepclassmembers class fqcn.of.javascript.interface.for.webview { -# public *; -#} - -# Uncomment this to preserve the line number information for -# debugging stack traces. -#-keepattributes SourceFile,LineNumberTable - -# If you keep the line number information, uncomment this to -# hide the original source file name. -#-renamesourcefileattribute SourceFile \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/androidTest/java/com/example/executorchllamademo/PerfTest.java b/examples/demo-apps/android/LlamaDemo/app/src/androidTest/java/com/example/executorchllamademo/PerfTest.java deleted file mode 100644 index 32ec24a0df9..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/androidTest/java/com/example/executorchllamademo/PerfTest.java +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -package com.example.executorchllamademo; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; - -import android.os.Bundle; -import androidx.test.ext.junit.runners.AndroidJUnit4; -import androidx.test.platform.app.InstrumentationRegistry; -import java.io.File; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import org.json.JSONException; -import org.json.JSONObject; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.pytorch.executorch.extension.llm.LlmCallback; -import org.pytorch.executorch.extension.llm.LlmModule; - -@RunWith(AndroidJUnit4.class) -public class PerfTest implements LlmCallback { - - private static final String RESOURCE_PATH = "/data/local/tmp/llama/"; - private static final String TOKENIZER_BIN = "tokenizer.bin"; - - private final List results = new ArrayList<>(); - private final List tokensPerSecond = new ArrayList<>(); - - @Test - public void testTokensPerSecond() { - String tokenizerPath = RESOURCE_PATH + TOKENIZER_BIN; - // Find out the model name - File directory = new File(RESOURCE_PATH); - Arrays.stream(directory.listFiles()) - .filter(file -> file.getName().endsWith(".pte")) - .forEach( - model -> { - LlmModule mModule = new LlmModule(model.getPath(), tokenizerPath, 0.8f); - // Print the model name because there might be more than one of them - report("ModelName", model.getName()); - - int loadResult = mModule.load(); - // Check that the model can be load successfully - assertEquals(0, loadResult); - - // Run a testing prompt - mModule.generate("How do you do! 
I'm testing llama2 on mobile device", PerfTest.this); - assertFalse(tokensPerSecond.isEmpty()); - - final Float tps = tokensPerSecond.get(tokensPerSecond.size() - 1); - report("TPS", tps); - }); - } - - @Override - public void onResult(String result) { - results.add(result); - } - - @Override - public void onStats(String result) { - try { - JSONObject jsonObject = new JSONObject(result); - int numGeneratedTokens = jsonObject.getInt("generated_tokens"); - int inferenceEndMs = jsonObject.getInt("inference_end_ms"); - int promptEvalEndMs = jsonObject.getInt("prompt_eval_end_ms"); - float tps = (float) numGeneratedTokens / (inferenceEndMs - promptEvalEndMs) * 1000; - tokensPerSecond.add(tps); - } catch (JSONException e) { - } - } - - private void report(final String metric, final Float value) { - Bundle bundle = new Bundle(); - bundle.putFloat(metric, value); - InstrumentationRegistry.getInstrumentation().sendStatus(0, bundle); - } - - private void report(final String key, final String value) { - Bundle bundle = new Bundle(); - bundle.putString(key, value); - InstrumentationRegistry.getInstrumentation().sendStatus(0, bundle); - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml deleted file mode 100644 index 7096a7d4e76..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/AndroidManifest.xml +++ /dev/null @@ -1,85 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/BUCK b/examples/demo-apps/android/LlamaDemo/app/src/main/BUCK deleted file mode 100644 index a64e11d1306..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/BUCK +++ /dev/null @@ -1,67 +0,0 @@ -load("@fbcode_macros//build_defs:build_file_migration.bzl", "fbcode_target", "non_fbcode_target") -load("@fbsource//tools/build_defs/android:fb_android_binary.bzl", 
"fb_android_binary") -load("@fbsource//tools/build_defs/android:fb_android_library.bzl", "fb_android_library") -load("@fbsource//tools/build_defs/android:fb_android_resource.bzl", "fb_android_resource") - -oncall("executorch") - -non_fbcode_target(_kind = fb_android_resource, - name = "app_res", - package = "com.example.executorchllamademo", - res = "res", -) - -non_fbcode_target(_kind = fb_android_library, - name = "app_lib", - srcs = [ - "java/com/example/executorchllamademo/AppLog.java", - "java/com/example/executorchllamademo/BackendType.java", - "java/com/example/executorchllamademo/DemoSharedPreferences.java", - "java/com/example/executorchllamademo/ETImage.java", - "java/com/example/executorchllamademo/ETLogging.java", - "java/com/example/executorchllamademo/LlmBenchmarkRunner.java", - "java/com/example/executorchllamademo/LogsActivity.java", - "java/com/example/executorchllamademo/LogsAdapter.java", - "java/com/example/executorchllamademo/MainActivity.java", - "java/com/example/executorchllamademo/Message.java", - "java/com/example/executorchllamademo/MessageAdapter.java", - "java/com/example/executorchllamademo/MessageType.java", - "java/com/example/executorchllamademo/ModelRunner.java", - "java/com/example/executorchllamademo/ModelRunnerCallback.java", - "java/com/example/executorchllamademo/ModelType.java", - "java/com/example/executorchllamademo/ModelUtils.java", - "java/com/example/executorchllamademo/PromptFormat.java", - "java/com/example/executorchllamademo/SettingsActivity.java", - "java/com/example/executorchllamademo/SettingsFields.java", - ], - autoglob = False, - language = "JAVA", - deps = [ - ":app_res", - "//third-party/java/androidx/constraintlayout/constraintlayout:constraintlayout", - "//third-party/java/com/google/code/gson/gson:gson", - "//xplat/executorch/extension/android:executorch_llama", - ], -) - -non_fbcode_target(_kind = fb_android_binary, - name = "ExecuTorchLlamaDemo", - keystore = "//fbandroid/keystores:debug", - manifest = 
"AndroidManifest.xml", - manifest_entries = { - "min_sdk_version": 21, - "target_sdk_version": 34, - "version_code": "1", - "version_name": "1.0", - }, - package_type = "release", - skip_proguard = True, - deps = [ - ":app_lib", - ":app_res", - "//third-party/java/androidx/appcompat/appcompat:appcompat", - "//third-party/java/com/google/code/gson/gson:gson", - "//xplat/executorch/extension/android:executorch_llama", - "//xplat/executorch/extension/android/jni:executorch_llama_jni", - ], -) diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/AppLog.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/AppLog.java deleted file mode 100644 index 36d07419381..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/AppLog.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -package com.example.executorchllamademo; - -import java.text.SimpleDateFormat; -import java.util.Date; -import java.util.Locale; - -public class AppLog { - private final Long timestamp; - private final String message; - - public AppLog(String message) { - this.timestamp = getCurrentTimeStamp(); - this.message = message; - } - - public Long getTimestamp() { - return timestamp; - } - - public String getMessage() { - return message; - } - - public String getFormattedLog() { - return "[" + getFormattedTimeStamp() + "] " + message; - } - - private Long getCurrentTimeStamp() { - return System.currentTimeMillis(); - } - - private String getFormattedTimeStamp() { - return formatDate(timestamp); - } - - private String formatDate(long milliseconds) { - SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.getDefault()); - Date date = new Date(milliseconds); - return formatter.format(date); - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/BackendType.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/BackendType.java deleted file mode 100644 index 7c84799795f..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/BackendType.java +++ /dev/null @@ -1,7 +0,0 @@ -package com.example.executorchllamademo; - -public enum BackendType { - XNNPACK, - QUALCOMM, - MEDIATEK -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/DemoSharedPreferences.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/DemoSharedPreferences.java deleted file mode 100644 index 99a94c00ebb..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/DemoSharedPreferences.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package com.example.executorchllamademo; - -import android.content.Context; -import android.content.SharedPreferences; -import com.google.gson.Gson; -import com.google.gson.reflect.TypeToken; -import java.lang.reflect.Type; -import java.util.ArrayList; - -public class DemoSharedPreferences { - Context context; - SharedPreferences sharedPreferences; - - public DemoSharedPreferences(Context context) { - this.context = context; - this.sharedPreferences = getSharedPrefs(); - } - - private SharedPreferences getSharedPrefs() { - return context.getSharedPreferences( - context.getString(R.string.demo_pref_file_key), Context.MODE_PRIVATE); - } - - public String getSavedMessages() { - return sharedPreferences.getString(context.getString(R.string.saved_messages_json_key), ""); - } - - public void addMessages(MessageAdapter messageAdapter) { - SharedPreferences.Editor editor = sharedPreferences.edit(); - Gson gson = new Gson(); - String msgJSON = gson.toJson(messageAdapter.getSavedMessages()); - editor.putString(context.getString(R.string.saved_messages_json_key), msgJSON); - editor.apply(); - } - - public void removeExistingMessages() { - SharedPreferences.Editor editor = sharedPreferences.edit(); - editor.remove(context.getString(R.string.saved_messages_json_key)); - editor.apply(); - } - - public void addSettings(SettingsFields settingsFields) { - SharedPreferences.Editor editor = sharedPreferences.edit(); - Gson gson = new Gson(); - String settingsJSON = gson.toJson(settingsFields); - editor.putString(context.getString(R.string.settings_json_key), settingsJSON); - editor.apply(); - } - - public String getSettings() { - return sharedPreferences.getString(context.getString(R.string.settings_json_key), ""); - } - - public void saveLogs() { - SharedPreferences.Editor editor = sharedPreferences.edit(); - Gson gson = new Gson(); - 
String msgJSON = gson.toJson(ETLogging.getInstance().getLogs()); - editor.putString(context.getString(R.string.logs_json_key), msgJSON); - editor.apply(); - } - - public void removeExistingLogs() { - SharedPreferences.Editor editor = sharedPreferences.edit(); - editor.remove(context.getString(R.string.logs_json_key)); - editor.apply(); - } - - public ArrayList getSavedLogs() { - String logsJSONString = - sharedPreferences.getString(context.getString(R.string.logs_json_key), null); - if (logsJSONString == null || logsJSONString.isEmpty()) { - return new ArrayList<>(); - } - Gson gson = new Gson(); - Type type = new TypeToken>() {}.getType(); - ArrayList appLogs = gson.fromJson(logsJSONString, type); - if (appLogs == null) { - return new ArrayList<>(); - } - return appLogs; - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETImage.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETImage.java deleted file mode 100644 index e68c8472626..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETImage.java +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -package com.example.executorchllamademo; - -import android.content.ContentResolver; -import android.graphics.Bitmap; -import android.graphics.BitmapFactory; -import android.graphics.Color; -import android.net.Uri; -import androidx.annotation.Nullable; -import java.io.FileNotFoundException; -import java.io.InputStream; - -public class ETImage { - private int width; - private int height; - private final byte[] bytes; - private final Uri uri; - private final ContentResolver contentResolver; - - ETImage(ContentResolver contentResolver, Uri uri) { - this.contentResolver = contentResolver; - this.uri = uri; - bytes = getBytesFromImageURI(uri); - } - - public int getWidth() { - return width; - } - - public int getHeight() { - return height; - } - - public Uri getUri() { - return uri; - } - - public byte[] getBytes() { - return bytes; - } - - public int[] getInts() { - // We need to convert the byte array to an int array because - // the runner expects an int array as input. - int[] intArray = new int[bytes.length]; - for (int i = 0; i < bytes.length; i++) { - intArray[i] = (bytes[i++] & 0xFF); - } - return intArray; - } - - private byte[] getBytesFromImageURI(Uri uri) { - try { - int RESIZED_IMAGE_WIDTH = 336; - Bitmap bitmap = resizeImage(uri, RESIZED_IMAGE_WIDTH); - - if (bitmap == null) { - ETLogging.getInstance().log("Unable to get bytes from Image URI. 
Bitmap is null"); - return new byte[0]; - } - - width = bitmap.getWidth(); - height = bitmap.getHeight(); - - byte[] rgbValues = new byte[width * height * 3]; - - for (int y = 0; y < height; y++) { - for (int x = 0; x < width; x++) { - // Get the color of the current pixel - int color = bitmap.getPixel(x, y); - - // Extract the RGB values from the color - int red = Color.red(color); - int green = Color.green(color); - int blue = Color.blue(color); - - // Store the RGB values in the byte array - rgbValues[y * width + x] = (byte) red; - rgbValues[(y * width + x) + height * width] = (byte) green; - rgbValues[(y * width + x) + 2 * height * width] = (byte) blue; - } - } - return rgbValues; - } catch (FileNotFoundException e) { - throw new RuntimeException(e); - } - } - - @Nullable - private Bitmap resizeImage(Uri uri, int maxLength) throws FileNotFoundException { - InputStream inputStream = contentResolver.openInputStream(uri); - if (inputStream == null) { - ETLogging.getInstance().log("Unable to resize image, input streams is null"); - return null; - } - Bitmap bitmap = BitmapFactory.decodeStream(inputStream); - if (bitmap == null) { - ETLogging.getInstance().log("Unable to resize image, bitmap during decode stream is null"); - return null; - } - - float aspectRatio; - int finalWidth, finalHeight; - - if (bitmap.getWidth() > bitmap.getHeight()) { - // width > height --> width = maxLength, height scale with aspect ratio - aspectRatio = bitmap.getWidth() / (float) bitmap.getHeight(); - finalWidth = maxLength; - finalHeight = Math.round(maxLength / aspectRatio); - } else { - // height >= width --> height = maxLength, width scale with aspect ratio - aspectRatio = bitmap.getHeight() / (float) bitmap.getWidth(); - finalHeight = maxLength; - finalWidth = Math.round(maxLength / aspectRatio); - } - - return Bitmap.createScaledBitmap(bitmap, finalWidth, finalHeight, false); - } -} diff --git 
a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETLogging.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETLogging.java deleted file mode 100644 index e595348945f..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ETLogging.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package com.example.executorchllamademo; - -import android.app.Application; -import android.util.Log; -import java.util.ArrayList; - -public class ETLogging extends Application { - private static ETLogging singleton; - - private ArrayList logs; - private DemoSharedPreferences mDemoSharedPreferences; - - @Override - public void onCreate() { - super.onCreate(); - singleton = this; - mDemoSharedPreferences = new DemoSharedPreferences(this.getApplicationContext()); - logs = mDemoSharedPreferences.getSavedLogs(); - if (logs == null) { // We don't have existing sharedPreference stored - logs = new ArrayList<>(); - } - } - - public static ETLogging getInstance() { - return singleton; - } - - public void log(String message) { - AppLog appLog = new AppLog(message); - logs.add(appLog); - Log.d("ETLogging", appLog.getMessage()); - } - - public ArrayList getLogs() { - return logs; - } - - public void clearLogs() { - logs.clear(); - mDemoSharedPreferences.removeExistingLogs(); - } - - public void saveLogs() { - mDemoSharedPreferences.saveLogs(); - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java deleted file mode 100644 index 8c2d60252a0..00000000000 --- 
a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package com.example.executorchllamademo; - -import android.app.Activity; -import android.app.ActivityManager; -import android.content.Intent; -import android.os.Build; -import android.os.Bundle; -import android.util.Log; -import android.widget.TextView; -import androidx.annotation.NonNull; -import com.google.gson.Gson; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -public class LlmBenchmarkRunner extends Activity implements ModelRunnerCallback { - ModelRunner mModelRunner; - - String mPrompt; - TextView mTextView; - StatsDump mStatsDump; - - @Override - protected void onCreate(Bundle savedInstanceState) { - super.onCreate(savedInstanceState); - setContentView(R.layout.activity_benchmarking); - mTextView = findViewById(R.id.log_view); - - Intent intent = getIntent(); - - File modelDir = new File(intent.getStringExtra("model_dir")); - File model = - Arrays.stream(modelDir.listFiles()) - .filter(file -> file.getName().endsWith(".pte")) - .findFirst() - .get(); - String tokenizerPath = intent.getStringExtra("tokenizer_path"); - - float temperature = intent.getFloatExtra("temperature", 0.8f); - mPrompt = intent.getStringExtra("prompt"); - if (mPrompt == null) { - mPrompt = "The ultimate answer"; - } - - mStatsDump = new StatsDump(); - mStatsDump.modelName = model.getName().replace(".pte", ""); - mModelRunner = new ModelRunner(model.getPath(), tokenizerPath, temperature, this); - mStatsDump.loadStart = System.nanoTime(); - } - 
- @Override - public void onModelLoaded(int status) { - mStatsDump.loadEnd = System.nanoTime(); - mStatsDump.loadStatus = status; - if (status != 0) { - Log.e("LlmBenchmarkRunner", "Loaded failed: " + status); - onGenerationStopped(); - return; - } - mStatsDump.generateStart = System.nanoTime(); - mModelRunner.generate(mPrompt); - } - - @Override - public void onTokenGenerated(String token) { - runOnUiThread( - () -> { - mTextView.append(token); - }); - } - - @Override - public void onStats(String stats) { - mStatsDump.tokens = stats; - } - - @Override - public void onGenerationStopped() { - mStatsDump.generateEnd = System.nanoTime(); - runOnUiThread( - () -> { - mTextView.append(mStatsDump.toString()); - }); - - final BenchmarkMetric.BenchmarkModel benchmarkModel = - BenchmarkMetric.extractBackendAndQuantization(mStatsDump.modelName); - final List results = new ArrayList<>(); - // The list of metrics we have atm includes: - // Load status - results.add(new BenchmarkMetric(benchmarkModel, "load_status", mStatsDump.loadStatus, 0)); - // Model load time - results.add( - new BenchmarkMetric( - benchmarkModel, - "model_load_time(ms)", - (mStatsDump.loadEnd - mStatsDump.loadStart) * 1e-6, - 0.0f)); - // LLM generate time - results.add( - new BenchmarkMetric( - benchmarkModel, - "generate_time(ms)", - (mStatsDump.generateEnd - mStatsDump.generateStart) * 1e-6, - 0.0f)); - // Token per second - results.add( - new BenchmarkMetric(benchmarkModel, "token_per_sec", extractTPS(mStatsDump.tokens), 0.0f)); - - try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.json")) { - Gson gson = new Gson(); - writer.write(gson.toJson(results)); - } catch (IOException e) { - e.printStackTrace(); - } - } - - private double extractTPS(final String tokens) { - final Matcher m = Pattern.compile("\\d+\\.?\\d*").matcher(tokens); - if (m.find()) { - return Double.parseDouble(m.group()); - } else { - return 0.0f; - } - } -} - -class BenchmarkMetric { - public static class 
BenchmarkModel { - // The model name, i.e. stories110M - String name; - String backend; - String quantization; - - public BenchmarkModel(final String name, final String backend, final String quantization) { - this.name = name; - this.backend = backend; - this.quantization = quantization; - } - } - - BenchmarkModel benchmarkModel; - - // The metric name, i.e. TPS - String metric; - - // The actual value and the option target value - double actualValue; - double targetValue; - - public static class DeviceInfo { - // Let's see which information we want to include here - final String device = Build.BRAND; - // The phone model and Android release version - final String arch = Build.MODEL; - final String os = "Android " + Build.VERSION.RELEASE; - final long totalMem = new ActivityManager.MemoryInfo().totalMem; - final long availMem = new ActivityManager.MemoryInfo().availMem; - } - - DeviceInfo deviceInfo = new DeviceInfo(); - - public BenchmarkMetric( - final BenchmarkModel benchmarkModel, - final String metric, - final double actualValue, - final double targetValue) { - this.benchmarkModel = benchmarkModel; - this.metric = metric; - this.actualValue = actualValue; - this.targetValue = targetValue; - } - - // TODO (huydhn): Figure out a way to extract the backend and quantization information from - // the .pte model itself instead of parsing its name - public static BenchmarkMetric.BenchmarkModel extractBackendAndQuantization(final String model) { - final Matcher m = - Pattern.compile("(?\\w+)_(?[\\w\\+]+)_(?\\w+)").matcher(model); - if (m.matches()) { - return new BenchmarkMetric.BenchmarkModel( - m.group("name"), m.group("backend"), m.group("quantization")); - } else { - return new BenchmarkMetric.BenchmarkModel(model, "", ""); - } - } -} - -class StatsDump { - int loadStatus; - long loadStart; - long loadEnd; - long generateStart; - long generateEnd; - String tokens; - String modelName; - - @NonNull - @Override - public String toString() { - return "loadStart: " - + 
loadStart - + "\nloadEnd: " - + loadEnd - + "\ngenerateStart: " - + generateStart - + "\ngenerateEnd: " - + generateEnd - + "\n" - + tokens; - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsActivity.java deleted file mode 100644 index 7777b275e6e..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsActivity.java +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package com.example.executorchllamademo; - -import android.app.AlertDialog; -import android.content.DialogInterface; -import android.os.Build; -import android.os.Bundle; -import android.widget.ImageButton; -import android.widget.ListView; -import androidx.appcompat.app.AppCompatActivity; -import androidx.core.content.ContextCompat; -import androidx.core.graphics.Insets; -import androidx.core.view.ViewCompat; -import androidx.core.view.WindowInsetsCompat; - -public class LogsActivity extends AppCompatActivity { - - private LogsAdapter mLogsAdapter; - - @Override - protected void onCreate(Bundle savedInstanceState) { - super.onCreate(savedInstanceState); - setContentView(R.layout.activity_logs); - if (Build.VERSION.SDK_INT >= 21) { - getWindow().setStatusBarColor(ContextCompat.getColor(this, R.color.status_bar)); - getWindow().setNavigationBarColor(ContextCompat.getColor(this, R.color.nav_bar)); - } - ViewCompat.setOnApplyWindowInsetsListener( - requireViewById(R.id.main), - (v, insets) -> { - Insets systemBars = insets.getInsets(WindowInsetsCompat.Type.systemBars()); - v.setPadding(systemBars.left, systemBars.top, systemBars.right, systemBars.bottom); - return insets; - }); - - 
setupLogs(); - setupClearLogsButton(); - } - - @Override - public void onResume() { - super.onResume(); - mLogsAdapter.clear(); - mLogsAdapter.addAll(ETLogging.getInstance().getLogs()); - mLogsAdapter.notifyDataSetChanged(); - } - - private void setupLogs() { - ListView mLogsListView = requireViewById(R.id.logsListView); - mLogsAdapter = new LogsAdapter(this, R.layout.logs_message); - - mLogsListView.setAdapter(mLogsAdapter); - mLogsAdapter.addAll(ETLogging.getInstance().getLogs()); - mLogsAdapter.notifyDataSetChanged(); - } - - private void setupClearLogsButton() { - ImageButton clearLogsButton = requireViewById(R.id.clearLogsButton); - clearLogsButton.setOnClickListener( - view -> { - new AlertDialog.Builder(this) - .setTitle("Delete Logs History") - .setMessage("Do you really want to delete logs history?") - .setIcon(android.R.drawable.ic_dialog_alert) - .setPositiveButton( - android.R.string.yes, - new DialogInterface.OnClickListener() { - public void onClick(DialogInterface dialog, int whichButton) { - // Clear the messageAdapter and sharedPreference - ETLogging.getInstance().clearLogs(); - mLogsAdapter.clear(); - mLogsAdapter.notifyDataSetChanged(); - } - }) - .setNegativeButton(android.R.string.no, null) - .show(); - }); - } - - @Override - protected void onDestroy() { - super.onDestroy(); - ETLogging.getInstance().saveLogs(); - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsAdapter.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsAdapter.java deleted file mode 100644 index 76c6a1aa1b4..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LogsAdapter.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package com.example.executorchllamademo; - -import android.view.LayoutInflater; -import android.view.View; -import android.view.ViewGroup; -import android.widget.ArrayAdapter; -import android.widget.TextView; -import androidx.annotation.NonNull; -import java.util.Objects; - -public class LogsAdapter extends ArrayAdapter { - public LogsAdapter(android.content.Context context, int resource) { - super(context, resource); - } - - static class ViewHolder { - private TextView logTextView; - } - - @NonNull - @Override - public View getView(int position, View convertView, @NonNull ViewGroup parent) { - ViewHolder mViewHolder = null; - - String logMessage = Objects.requireNonNull(getItem(position)).getFormattedLog(); - - if (convertView == null || convertView.getTag() == null) { - mViewHolder = new ViewHolder(); - convertView = LayoutInflater.from(getContext()).inflate(R.layout.logs_message, parent, false); - mViewHolder.logTextView = convertView.requireViewById(R.id.logsTextView); - } else { - mViewHolder = (ViewHolder) convertView.getTag(); - } - mViewHolder.logTextView.setText(logMessage); - return convertView; - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java deleted file mode 100644 index f995c5bc65a..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MainActivity.java +++ /dev/null @@ -1,847 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -package com.example.executorchllamademo; - -import android.Manifest; -import android.app.ActivityManager; -import android.app.AlertDialog; -import android.content.ContentResolver; -import android.content.ContentValues; -import android.content.Intent; -import android.content.pm.PackageManager; -import android.net.Uri; -import android.os.Build; -import android.os.Bundle; -import android.os.Handler; -import android.os.Looper; -import android.os.Process; -import android.provider.MediaStore; -import android.system.ErrnoException; -import android.system.Os; -import android.util.Log; -import android.view.View; -import android.view.inputmethod.InputMethodManager; -import android.widget.EditText; -import android.widget.ImageButton; -import android.widget.ImageView; -import android.widget.LinearLayout; -import android.widget.ListView; -import android.widget.TextView; -import android.widget.Toast; -import androidx.activity.result.ActivityResultLauncher; -import androidx.activity.result.PickVisualMediaRequest; -import androidx.activity.result.contract.ActivityResultContracts; -import androidx.annotation.NonNull; -import androidx.appcompat.app.AppCompatActivity; -import androidx.constraintlayout.widget.ConstraintLayout; -import androidx.core.app.ActivityCompat; -import androidx.core.content.ContextCompat; -import androidx.core.content.res.ResourcesCompat; -import com.google.gson.Gson; -import com.google.gson.reflect.TypeToken; -import java.lang.reflect.Type; -import java.util.ArrayList; -import java.util.List; -import java.util.concurrent.Executor; -import java.util.concurrent.Executors; -import org.json.JSONException; -import org.json.JSONObject; -import org.pytorch.executorch.extension.llm.LlmCallback; -import org.pytorch.executorch.extension.llm.LlmModule; - -public class MainActivity extends AppCompatActivity implements Runnable, LlmCallback { - private EditText mEditTextMessage; - private ImageButton mThinkModeButton; - private ImageButton mSendButton; - private 
ImageButton mGalleryButton; - private ImageButton mCameraButton; - private ListView mMessagesView; - private MessageAdapter mMessageAdapter; - private LlmModule mModule = null; - private Message mResultMessage = null; - private ImageButton mSettingsButton; - private TextView mMemoryView; - private ActivityResultLauncher mPickGallery; - private ActivityResultLauncher mCameraRoll; - private List mSelectedImageUri; - private ConstraintLayout mMediaPreviewConstraintLayout; - private LinearLayout mAddMediaLayout; - private static final int MAX_NUM_OF_IMAGES = 5; - private static final int REQUEST_IMAGE_CAPTURE = 1; - private Uri cameraImageUri; - private DemoSharedPreferences mDemoSharedPreferences; - private SettingsFields mCurrentSettingsFields; - private Handler mMemoryUpdateHandler; - private Runnable memoryUpdater; - private boolean mThinkMode = false; - private int promptID = 0; - private static final int CONVERSATION_HISTORY_MESSAGE_LOOKBACK = 2; - private Executor executor; - - @Override - public void onResult(String result) { - if (result.equals(PromptFormat.getStopToken(mCurrentSettingsFields.getModelType()))) { - return; - } - result = PromptFormat.replaceSpecialToken(mCurrentSettingsFields.getModelType(), result); - if (result.equals("\n\n") || result.equals("\n")) { - if (!mResultMessage.getText().isEmpty()) { - mResultMessage.appendText(result); - run(); - } - } else { - mResultMessage.appendText(result); - run(); - } - } - - @Override - public void onStats(String stats) { - runOnUiThread( - () -> { - if (mResultMessage != null) { - float tps = 0; - try { - JSONObject jsonObject = new JSONObject(stats); - int numGeneratedTokens = jsonObject.getInt("generated_tokens"); - int inferenceEndMs = jsonObject.getInt("inference_end_ms"); - int promptEvalEndMs = jsonObject.getInt("prompt_eval_end_ms"); - tps = (float) numGeneratedTokens / (inferenceEndMs - promptEvalEndMs) * 1000; - } catch (JSONException e) { - Log.e("LLM", "Error parsing JSON: " + e.getMessage()); 
- } - mResultMessage.setTokensPerSecond(tps); - mMessageAdapter.notifyDataSetChanged(); - } - }); - } - - private void setLocalModel(String modelPath, String tokenizerPath, float temperature) { - Message modelLoadingMessage = new Message("Loading model...", false, MessageType.SYSTEM, 0); - ETLogging.getInstance().log("Loading model " + modelPath + " with tokenizer " + tokenizerPath); - runOnUiThread( - () -> { - mSendButton.setEnabled(false); - mMessageAdapter.add(modelLoadingMessage); - mMessageAdapter.notifyDataSetChanged(); - }); - if (mModule != null) { - ETLogging.getInstance().log("Start deallocating existing module instance"); - mModule.resetNative(); - mModule = null; - ETLogging.getInstance().log("Completed deallocating existing module instance"); - } - long runStartTime = System.currentTimeMillis(); - mModule = - new LlmModule( - ModelUtils.getModelCategory( - mCurrentSettingsFields.getModelType(), mCurrentSettingsFields.getBackendType()), - modelPath, - tokenizerPath, - temperature); - int loadResult = mModule.load(); - long loadDuration = System.currentTimeMillis() - runStartTime; - String modelLoadError = ""; - String modelInfo = ""; - if (loadResult != 0) { - // TODO: Map the error code to a reason to let the user know why model loading failed - modelInfo = "*Model could not load (Error Code: " + loadResult + ")*" + "\n"; - loadDuration = 0; - AlertDialog.Builder builder = new AlertDialog.Builder(this); - builder.setTitle("Load failed: " + loadResult); - runOnUiThread( - () -> { - AlertDialog alert = builder.create(); - alert.show(); - }); - } else { - String[] segments = modelPath.split("/"); - String pteName = segments[segments.length - 1]; - segments = tokenizerPath.split("/"); - String tokenizerName = segments[segments.length - 1]; - modelInfo = - "Successfully loaded model. " - + pteName - + " and tokenizer " - + tokenizerName - + " in " - + (float) loadDuration / 1000 - + " sec." 
- + " You can send text or image for inference"; - - if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) { - ETLogging.getInstance().log("Llava start prefill prompt"); - mModule.resetContext(); - mModule.prefillPrompt(PromptFormat.getLlavaPresetPrompt()); - ETLogging.getInstance().log("Llava completes prefill prompt"); - } - } - - Message modelLoadedMessage = new Message(modelInfo, false, MessageType.SYSTEM, 0); - - String modelLoggingInfo = - modelLoadError - + "Model path: " - + modelPath - + "\nTokenizer path: " - + tokenizerPath - + "\nBackend: " - + mCurrentSettingsFields.getBackendType().toString() - + "\nModelType: " - + ModelUtils.getModelCategory( - mCurrentSettingsFields.getModelType(), mCurrentSettingsFields.getBackendType()) - + "\nTemperature: " - + temperature - + "\nModel loaded time: " - + loadDuration - + " ms"; - ETLogging.getInstance().log("Load complete. " + modelLoggingInfo); - - runOnUiThread( - () -> { - mSendButton.setEnabled(true); - mMessageAdapter.remove(modelLoadingMessage); - mMessageAdapter.add(modelLoadedMessage); - mMessageAdapter.notifyDataSetChanged(); - }); - } - - private void loadLocalModelAndParameters( - String modelFilePath, String tokenizerFilePath, float temperature) { - Runnable runnable = - new Runnable() { - @Override - public void run() { - setLocalModel(modelFilePath, tokenizerFilePath, temperature); - } - }; - new Thread(runnable).start(); - } - - private void populateExistingMessages(String existingMsgJSON) { - Gson gson = new Gson(); - Type type = new TypeToken>() {}.getType(); - ArrayList savedMessages = gson.fromJson(existingMsgJSON, type); - for (Message msg : savedMessages) { - mMessageAdapter.add(msg); - } - mMessageAdapter.notifyDataSetChanged(); - } - - private int setPromptID() { - - return mMessageAdapter.getMaxPromptID() + 1; - } - - @Override - protected void onCreate(Bundle savedInstanceState) { - super.onCreate(savedInstanceState); - setContentView(R.layout.activity_main); - - if 
(Build.VERSION.SDK_INT >= 21) { - getWindow().setStatusBarColor(ContextCompat.getColor(this, R.color.status_bar)); - getWindow().setNavigationBarColor(ContextCompat.getColor(this, R.color.nav_bar)); - } - - try { - Os.setenv("ADSP_LIBRARY_PATH", getApplicationInfo().nativeLibraryDir, true); - Os.setenv("LD_LIBRARY_PATH", getApplicationInfo().nativeLibraryDir, true); - } catch (ErrnoException e) { - finish(); - } - - mThinkModeButton = requireViewById(R.id.thinkModeButton); - mEditTextMessage = requireViewById(R.id.editTextMessage); - mSendButton = requireViewById(R.id.sendButton); - mSendButton.setEnabled(false); - mMessagesView = requireViewById(R.id.messages_view); - mMessageAdapter = new MessageAdapter(this, R.layout.sent_message, new ArrayList()); - mMessagesView.setAdapter(mMessageAdapter); - mDemoSharedPreferences = new DemoSharedPreferences(this.getApplicationContext()); - String existingMsgJSON = mDemoSharedPreferences.getSavedMessages(); - if (!existingMsgJSON.isEmpty()) { - populateExistingMessages(existingMsgJSON); - promptID = setPromptID(); - } - mSettingsButton = requireViewById(R.id.settings); - mSettingsButton.setOnClickListener( - view -> { - Intent myIntent = new Intent(MainActivity.this, SettingsActivity.class); - MainActivity.this.startActivity(myIntent); - }); - - mThinkModeButton.setOnClickListener( - view -> { - if (mThinkMode) { - mThinkMode = false; - mThinkModeButton.setImageDrawable( - ResourcesCompat.getDrawable( - getResources(), R.drawable.baseline_lightbulb_24, null)); - } else { - mThinkMode = true; - mThinkModeButton.setImageDrawable( - ResourcesCompat.getDrawable(getResources(), R.drawable.blue_lightbulb_24, null)); - } - runOnUiThread( - () -> { - String thinkingModeText = mThinkMode ? 
"on" : "off"; - mMessageAdapter.add( - new Message( - "Thinking mode is " + thinkingModeText, false, MessageType.SYSTEM, 0)); - mMessageAdapter.notifyDataSetChanged(); - }); - }); - - mCurrentSettingsFields = new SettingsFields(); - mMemoryUpdateHandler = new Handler(Looper.getMainLooper()); - onModelRunStopped(); - setupMediaButton(); - setupGalleryPicker(); - setupCameraRoll(); - startMemoryUpdate(); - setupShowLogsButton(); - executor = Executors.newSingleThreadExecutor(); - } - - @Override - protected void onPause() { - super.onPause(); - mDemoSharedPreferences.addMessages(mMessageAdapter); - } - - @Override - protected void onResume() { - super.onResume(); - // Check for if settings parameters have changed - Gson gson = new Gson(); - String settingsFieldsJSON = mDemoSharedPreferences.getSettings(); - if (!settingsFieldsJSON.isEmpty()) { - SettingsFields updatedSettingsFields = - gson.fromJson(settingsFieldsJSON, SettingsFields.class); - if (updatedSettingsFields == null) { - // Added this check, because gson.fromJson can return null - askUserToSelectModel(); - return; - } - boolean isUpdated = !mCurrentSettingsFields.equals(updatedSettingsFields); - boolean isLoadModel = updatedSettingsFields.getIsLoadModel(); - setBackendMode(updatedSettingsFields.getBackendType()); - if (isUpdated) { - if (isLoadModel) { - // If users change the model file, but not pressing loadModelButton, we won't load the new - // model - checkForUpdateAndReloadModel(updatedSettingsFields); - } else { - askUserToSelectModel(); - } - - checkForClearChatHistory(updatedSettingsFields); - // Update current to point to the latest - mCurrentSettingsFields = new SettingsFields(updatedSettingsFields); - } - } else { - askUserToSelectModel(); - } - } - - private void setBackendMode(BackendType backendType) { - if (backendType.equals(BackendType.XNNPACK) || backendType.equals(BackendType.QUALCOMM)) { - setXNNPACKMode(); - } else if (backendType.equals(BackendType.MEDIATEK)) { - setMediaTekMode(); - 
} - } - - private void setXNNPACKMode() { - requireViewById(R.id.addMediaButton).setVisibility(View.VISIBLE); - } - - private void setMediaTekMode() { - requireViewById(R.id.addMediaButton).setVisibility(View.GONE); - } - - private void checkForClearChatHistory(SettingsFields updatedSettingsFields) { - if (updatedSettingsFields.getIsClearChatHistory()) { - mMessageAdapter.clear(); - mMessageAdapter.notifyDataSetChanged(); - mDemoSharedPreferences.removeExistingMessages(); - // changing to false since chat history has been cleared. - updatedSettingsFields.saveIsClearChatHistory(false); - mDemoSharedPreferences.addSettings(updatedSettingsFields); - } - } - - private void checkForUpdateAndReloadModel(SettingsFields updatedSettingsFields) { - // TODO need to add 'load model' in settings and queue loading based on that - String modelPath = updatedSettingsFields.getModelFilePath(); - String tokenizerPath = updatedSettingsFields.getTokenizerFilePath(); - double temperature = updatedSettingsFields.getTemperature(); - if (!modelPath.isEmpty() && !tokenizerPath.isEmpty()) { - if (updatedSettingsFields.getIsLoadModel() - || !modelPath.equals(mCurrentSettingsFields.getModelFilePath()) - || !tokenizerPath.equals(mCurrentSettingsFields.getTokenizerFilePath()) - || temperature != mCurrentSettingsFields.getTemperature()) { - loadLocalModelAndParameters( - updatedSettingsFields.getModelFilePath(), - updatedSettingsFields.getTokenizerFilePath(), - (float) updatedSettingsFields.getTemperature()); - updatedSettingsFields.saveLoadModelAction(false); - mDemoSharedPreferences.addSettings(updatedSettingsFields); - } - } else { - askUserToSelectModel(); - } - } - - private void askUserToSelectModel() { - String askLoadModel = - "To get started, select your desired model and tokenizer " + "from the top right corner"; - Message askLoadModelMessage = new Message(askLoadModel, false, MessageType.SYSTEM, 0); - ETLogging.getInstance().log(askLoadModel); - runOnUiThread( - () -> { - 
mMessageAdapter.add(askLoadModelMessage); - mMessageAdapter.notifyDataSetChanged(); - }); - } - - private void setupShowLogsButton() { - ImageButton showLogsButton = requireViewById(R.id.showLogsButton); - showLogsButton.setOnClickListener( - view -> { - Intent myIntent = new Intent(MainActivity.this, LogsActivity.class); - MainActivity.this.startActivity(myIntent); - }); - } - - private void setupMediaButton() { - mAddMediaLayout = requireViewById(R.id.addMediaLayout); - mAddMediaLayout.setVisibility(View.GONE); // We hide this initially - - ImageButton addMediaButton = requireViewById(R.id.addMediaButton); - addMediaButton.setOnClickListener( - view -> { - mAddMediaLayout.setVisibility(View.VISIBLE); - }); - - mGalleryButton = requireViewById(R.id.galleryButton); - mGalleryButton.setOnClickListener( - view -> { - // Launch the photo picker and let the user choose only images. - mPickGallery.launch( - new PickVisualMediaRequest.Builder() - .setMediaType(ActivityResultContracts.PickVisualMedia.ImageOnly.INSTANCE) - .build()); - }); - mCameraButton = requireViewById(R.id.cameraButton); - mCameraButton.setOnClickListener( - view -> { - Log.d("CameraRoll", "Check permission"); - if (ContextCompat.checkSelfPermission(MainActivity.this, Manifest.permission.CAMERA) - != PackageManager.PERMISSION_GRANTED) { - ActivityCompat.requestPermissions( - MainActivity.this, - new String[] {Manifest.permission.CAMERA}, - REQUEST_IMAGE_CAPTURE); - } else { - launchCamera(); - } - }); - } - - private void setupCameraRoll() { - // Registers a camera roll activity launcher. 
- mCameraRoll = - registerForActivityResult( - new ActivityResultContracts.TakePicture(), - result -> { - if (result && cameraImageUri != null) { - Log.d("CameraRoll", "Photo saved to uri: " + cameraImageUri); - mAddMediaLayout.setVisibility(View.GONE); - List uris = new ArrayList<>(); - uris.add(cameraImageUri); - showMediaPreview(uris); - } else { - // Delete the temp image file based on the url since the photo is not successfully - // taken - if (cameraImageUri != null) { - ContentResolver contentResolver = MainActivity.this.getContentResolver(); - contentResolver.delete(cameraImageUri, null, null); - Log.d("CameraRoll", "No photo taken. Delete temp uri"); - } - } - }); - mMediaPreviewConstraintLayout = requireViewById(R.id.mediaPreviewConstraintLayout); - ImageButton mediaPreviewCloseButton = requireViewById(R.id.mediaPreviewCloseButton); - mediaPreviewCloseButton.setOnClickListener( - view -> { - mMediaPreviewConstraintLayout.setVisibility(View.GONE); - mSelectedImageUri = null; - }); - - ImageButton addMoreImageButton = requireViewById(R.id.addMoreImageButton); - addMoreImageButton.setOnClickListener( - view -> { - Log.d("addMore", "clicked"); - mMediaPreviewConstraintLayout.setVisibility(View.GONE); - // Direct user to select type of input - mCameraButton.callOnClick(); - }); - } - - private String updateMemoryUsage() { - ActivityManager.MemoryInfo memoryInfo = new ActivityManager.MemoryInfo(); - ActivityManager activityManager = (ActivityManager) getSystemService(ACTIVITY_SERVICE); - if (activityManager == null) { - return "---"; - } - activityManager.getMemoryInfo(memoryInfo); - long totalMem = memoryInfo.totalMem / (1024 * 1024); - long availableMem = memoryInfo.availMem / (1024 * 1024); - long usedMem = totalMem - availableMem; - return usedMem + "MB"; - } - - private void startMemoryUpdate() { - mMemoryView = requireViewById(R.id.ram_usage_live); - memoryUpdater = - new Runnable() { - @Override - public void run() { - 
mMemoryView.setText(updateMemoryUsage()); - mMemoryUpdateHandler.postDelayed(this, 1000); - } - }; - mMemoryUpdateHandler.post(memoryUpdater); - } - - @Override - public void onRequestPermissionsResult( - int requestCode, @NonNull String[] permissions, @NonNull int[] grantResults) { - super.onRequestPermissionsResult(requestCode, permissions, grantResults); - if (requestCode == REQUEST_IMAGE_CAPTURE && grantResults.length != 0) { - if (grantResults[0] == PackageManager.PERMISSION_GRANTED) { - launchCamera(); - } else if (grantResults[0] == PackageManager.PERMISSION_DENIED) { - Log.d("CameraRoll", "Permission denied"); - } - } - } - - private void launchCamera() { - ContentValues values = new ContentValues(); - values.put(MediaStore.Images.Media.TITLE, "New Picture"); - values.put(MediaStore.Images.Media.DESCRIPTION, "From Camera"); - values.put(MediaStore.Images.Media.RELATIVE_PATH, "DCIM/Camera/"); - cameraImageUri = - MainActivity.this - .getContentResolver() - .insert(MediaStore.Images.Media.EXTERNAL_CONTENT_URI, values); - mCameraRoll.launch(cameraImageUri); - } - - private void setupGalleryPicker() { - // Registers a photo picker activity launcher in single-select mode. 
- mPickGallery = - registerForActivityResult( - new ActivityResultContracts.PickMultipleVisualMedia(MAX_NUM_OF_IMAGES), - uris -> { - if (!uris.isEmpty()) { - Log.d("PhotoPicker", "Selected URIs: " + uris); - mAddMediaLayout.setVisibility(View.GONE); - for (Uri uri : uris) { - MainActivity.this - .getContentResolver() - .takePersistableUriPermission(uri, Intent.FLAG_GRANT_READ_URI_PERMISSION); - } - showMediaPreview(uris); - } else { - Log.d("PhotoPicker", "No media selected"); - } - }); - - mMediaPreviewConstraintLayout = requireViewById(R.id.mediaPreviewConstraintLayout); - ImageButton mediaPreviewCloseButton = requireViewById(R.id.mediaPreviewCloseButton); - mediaPreviewCloseButton.setOnClickListener( - view -> { - mMediaPreviewConstraintLayout.setVisibility(View.GONE); - mSelectedImageUri = null; - }); - - ImageButton addMoreImageButton = requireViewById(R.id.addMoreImageButton); - addMoreImageButton.setOnClickListener( - view -> { - Log.d("addMore", "clicked"); - mMediaPreviewConstraintLayout.setVisibility(View.GONE); - mGalleryButton.callOnClick(); - }); - } - - private List getProcessedImagesForModel(List uris) { - List imageList = new ArrayList<>(); - if (uris != null) { - uris.forEach( - (uri) -> { - imageList.add(new ETImage(this.getContentResolver(), uri)); - }); - } - return imageList; - } - - private void showMediaPreview(List uris) { - if (mSelectedImageUri == null) { - mSelectedImageUri = uris; - } else { - mSelectedImageUri.addAll(uris); - } - - if (mSelectedImageUri.size() > MAX_NUM_OF_IMAGES) { - mSelectedImageUri = mSelectedImageUri.subList(0, MAX_NUM_OF_IMAGES); - Toast.makeText( - this, "Only max " + MAX_NUM_OF_IMAGES + " images are allowed", Toast.LENGTH_SHORT) - .show(); - } - Log.d("mSelectedImageUri", mSelectedImageUri.size() + " " + mSelectedImageUri); - - mMediaPreviewConstraintLayout.setVisibility(View.VISIBLE); - - List imageViews = new ArrayList(); - - // Pre-populate all the image views that are available from the layout (currently 
max 5) - imageViews.add(requireViewById(R.id.mediaPreviewImageView1)); - imageViews.add(requireViewById(R.id.mediaPreviewImageView2)); - imageViews.add(requireViewById(R.id.mediaPreviewImageView3)); - imageViews.add(requireViewById(R.id.mediaPreviewImageView4)); - imageViews.add(requireViewById(R.id.mediaPreviewImageView5)); - - // Hide all the image views (reset state) - for (int i = 0; i < imageViews.size(); i++) { - imageViews.get(i).setVisibility(View.GONE); - } - - // Only show/render those that have proper Image URIs - for (int i = 0; i < mSelectedImageUri.size(); i++) { - imageViews.get(i).setVisibility(View.VISIBLE); - imageViews.get(i).setImageURI(mSelectedImageUri.get(i)); - } - - // For LLava, we want to call prefill_image as soon as an image is selected - // Llava only support 1 image for now - if (mCurrentSettingsFields.getModelType() == ModelType.LLAVA_1_5) { - List processedImageList = getProcessedImagesForModel(mSelectedImageUri); - if (!processedImageList.isEmpty()) { - mMessageAdapter.add( - new Message("Llava - Starting image Prefill.", false, MessageType.SYSTEM, 0)); - mMessageAdapter.notifyDataSetChanged(); - Runnable runnable = - () -> { - Process.setThreadPriority(Process.THREAD_PRIORITY_MORE_FAVORABLE); - ETLogging.getInstance().log("Starting runnable prefill image"); - ETImage img = processedImageList.get(0); - ETLogging.getInstance().log("Llava start prefill image"); - mModule.prefillImages( - img.getInts(), - img.getWidth(), - img.getHeight(), - ModelUtils.VISION_MODEL_IMAGE_CHANNELS); - }; - executor.execute(runnable); - } - } - } - - private void addSelectedImagesToChatThread(List selectedImageUri) { - if (selectedImageUri == null) { - return; - } - mMediaPreviewConstraintLayout.setVisibility(View.GONE); - for (int i = 0; i < selectedImageUri.size(); i++) { - Uri imageURI = selectedImageUri.get(i); - Log.d("image uri ", "test " + imageURI.getPath()); - mMessageAdapter.add(new Message(imageURI.toString(), true, MessageType.IMAGE, 0)); - 
} - mMessageAdapter.notifyDataSetChanged(); - } - - private String getConversationHistory() { - String conversationHistory = ""; - - ArrayList conversations = - mMessageAdapter.getRecentSavedTextMessages(CONVERSATION_HISTORY_MESSAGE_LOOKBACK); - if (conversations.isEmpty()) { - return conversationHistory; - } - - int prevPromptID = conversations.get(0).getPromptID(); - String conversationFormat = - PromptFormat.getConversationFormat(mCurrentSettingsFields.getModelType()); - String format = conversationFormat; - for (int i = 0; i < conversations.size(); i++) { - Message conversation = conversations.get(i); - int currentPromptID = conversation.getPromptID(); - if (currentPromptID != prevPromptID) { - conversationHistory = conversationHistory + format; - format = conversationFormat; - prevPromptID = currentPromptID; - } - if (conversation.getIsSent()) { - format = - format - .replace(PromptFormat.USER_PLACEHOLDER, conversation.getText()) - .replace(PromptFormat.THINKING_MODE_PLACEHOLDER, ""); - } else { - format = format.replace(PromptFormat.ASSISTANT_PLACEHOLDER, conversation.getText()); - } - } - conversationHistory = conversationHistory + format; - - return conversationHistory; - } - - private String getTotalFormattedPrompt(String conversationHistory, String rawPrompt) { - if (conversationHistory.isEmpty()) { - return mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt, mThinkMode); - } - - return mCurrentSettingsFields.getFormattedSystemPrompt() - + conversationHistory - + mCurrentSettingsFields.getFormattedUserPrompt(rawPrompt, mThinkMode); - } - - private void onModelRunStarted() { - mSendButton.setClickable(false); - mSendButton.setImageResource(R.drawable.baseline_stop_24); - mSendButton.setOnClickListener( - view -> { - mModule.stop(); - }); - } - - private void onModelRunStopped() { - mSendButton.setClickable(true); - mSendButton.setImageResource(R.drawable.baseline_send_24); - mSendButton.setOnClickListener( - view -> { - try { - 
InputMethodManager imm = (InputMethodManager) getSystemService(INPUT_METHOD_SERVICE); - imm.hideSoftInputFromWindow(getCurrentFocus().getWindowToken(), 0); - } catch (Exception e) { - ETLogging.getInstance().log("Keyboard dismissal error: " + e.getMessage()); - } - addSelectedImagesToChatThread(mSelectedImageUri); - String finalPrompt; - String rawPrompt = mEditTextMessage.getText().toString(); - if (ModelUtils.getModelCategory( - mCurrentSettingsFields.getModelType(), mCurrentSettingsFields.getBackendType()) - == ModelUtils.VISION_MODEL) { - finalPrompt = - mCurrentSettingsFields.getFormattedSystemAndUserPrompt(rawPrompt, mThinkMode); - } else { - finalPrompt = getTotalFormattedPrompt(getConversationHistory(), rawPrompt); - } - // We store raw prompt into message adapter, because we don't want to show the extra - // tokens from system prompt - mMessageAdapter.add(new Message(rawPrompt, true, MessageType.TEXT, promptID)); - mMessageAdapter.notifyDataSetChanged(); - mEditTextMessage.setText(""); - mResultMessage = new Message("", false, MessageType.TEXT, promptID); - mMessageAdapter.add(mResultMessage); - // Scroll to bottom of the list - mMessagesView.smoothScrollToPosition(mMessageAdapter.getCount() - 1); - // After images are added to prompt and chat thread, we clear the imageURI list - // Note: This has to be done after imageURIs are no longer needed by LlmModule - mSelectedImageUri = null; - promptID++; - Runnable runnable = - new Runnable() { - @Override - public void run() { - Process.setThreadPriority(Process.THREAD_PRIORITY_MORE_FAVORABLE); - ETLogging.getInstance().log("starting runnable generate()"); - runOnUiThread( - new Runnable() { - @Override - public void run() { - onModelRunStarted(); - } - }); - long generateStartTime = System.currentTimeMillis(); - if (ModelUtils.getModelCategory( - mCurrentSettingsFields.getModelType(), - mCurrentSettingsFields.getBackendType()) - == ModelUtils.VISION_MODEL) { - mModule.generate( - finalPrompt, 
ModelUtils.VISION_MODEL_SEQ_LEN, MainActivity.this, false); - } else if (mCurrentSettingsFields.getModelType() == ModelType.LLAMA_GUARD_3) { - String llamaGuardPromptForClassification = - PromptFormat.getFormattedLlamaGuardPrompt(rawPrompt); - ETLogging.getInstance() - .log("Running inference.. prompt=" + llamaGuardPromptForClassification); - mModule.generate( - llamaGuardPromptForClassification, - llamaGuardPromptForClassification.length() + 64, - MainActivity.this, - false); - } else { - ETLogging.getInstance().log("Running inference.. prompt=" + finalPrompt); - mModule.generate( - finalPrompt, - (int) (finalPrompt.length() * 0.75) + 64, - MainActivity.this, - false); - } - - long generateDuration = System.currentTimeMillis() - generateStartTime; - mResultMessage.setTotalGenerationTime(generateDuration); - runOnUiThread( - new Runnable() { - @Override - public void run() { - onModelRunStopped(); - } - }); - ETLogging.getInstance().log("Inference completed"); - } - }; - executor.execute(runnable); - }); - mMessageAdapter.notifyDataSetChanged(); - } - - @Override - public void run() { - runOnUiThread( - new Runnable() { - @Override - public void run() { - mMessageAdapter.notifyDataSetChanged(); - } - }); - } - - @Override - public void onBackPressed() { - super.onBackPressed(); - if (mAddMediaLayout != null && mAddMediaLayout.getVisibility() == View.VISIBLE) { - mAddMediaLayout.setVisibility(View.GONE); - } else { - // Default behavior of back button - finish(); - } - } - - @Override - protected void onDestroy() { - super.onDestroy(); - mMemoryUpdateHandler.removeCallbacks(memoryUpdater); - // This is to cover the case where the app is shutdown when user is on MainActivity but - // never clicked on the logsActivity - ETLogging.getInstance().saveLogs(); - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/Message.java 
b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/Message.java deleted file mode 100644 index b2e5380e2a5..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/Message.java +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package com.example.executorchllamademo; - -import java.text.SimpleDateFormat; -import java.util.Date; -import java.util.Locale; - -public class Message { - private String text; - private final boolean isSent; - private float tokensPerSecond; - private long totalGenerationTime; - private final long timestamp; - private final MessageType messageType; - private String imagePath; - private final int promptID; - - private static final String TIMESTAMP_FORMAT = "hh:mm a"; // example: 2:23 PM - - public Message(String text, boolean isSent, MessageType messageType, int promptID) { - this.isSent = isSent; - this.messageType = messageType; - this.promptID = promptID; - - if (messageType == MessageType.IMAGE) { - this.imagePath = text; - } else { - this.text = text; - } - - if (messageType != MessageType.SYSTEM) { - this.timestamp = System.currentTimeMillis(); - } else { - this.timestamp = (long) 0; - } - } - - public int getPromptID() { - return promptID; - } - - public MessageType getMessageType() { - return messageType; - } - - public String getImagePath() { - return imagePath; - } - - public String getText() { - return text; - } - - public void appendText(String text) { - this.text += text; - } - - public boolean getIsSent() { - return isSent; - } - - public void setTokensPerSecond(float tokensPerSecond) { - this.tokensPerSecond = tokensPerSecond; - } - - public void setTotalGenerationTime(long totalGenerationTime) { - this.totalGenerationTime = 
totalGenerationTime; - } - - public float getTokensPerSecond() { - return tokensPerSecond; - } - - public long getTotalGenerationTime() { - return totalGenerationTime; - } - - public long getTimestamp() { - return timestamp; - } - - public String getFormattedTimestamp() { - SimpleDateFormat formatter = new SimpleDateFormat(TIMESTAMP_FORMAT, Locale.getDefault()); - Date date = new Date(timestamp); - return formatter.format(date); - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java deleted file mode 100644 index 31aaa9a1d5f..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageAdapter.java +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -package com.example.executorchllamademo; - -import android.net.Uri; -import android.view.LayoutInflater; -import android.view.View; -import android.view.ViewGroup; -import android.widget.ArrayAdapter; -import android.widget.ImageView; -import android.widget.TextView; -import java.util.ArrayList; -import java.util.Collections; - -public class MessageAdapter extends ArrayAdapter { - - private final ArrayList savedMessages; - - public MessageAdapter( - android.content.Context context, int resource, ArrayList savedMessages) { - super(context, resource); - this.savedMessages = savedMessages; - } - - @Override - public View getView(int position, View convertView, ViewGroup parent) { - Message currentMessage = getItem(position); - int layoutIdForListItem; - - if (currentMessage.getMessageType() == MessageType.SYSTEM) { - layoutIdForListItem = R.layout.system_message; - } else { - layoutIdForListItem = - currentMessage.getIsSent() ? R.layout.sent_message : R.layout.received_message; - } - View listItemView = - LayoutInflater.from(getContext()).inflate(layoutIdForListItem, parent, false); - if (currentMessage.getMessageType() == MessageType.IMAGE) { - ImageView messageImageView = listItemView.requireViewById(R.id.message_image); - messageImageView.setImageURI(Uri.parse(currentMessage.getImagePath())); - TextView messageTextView = listItemView.requireViewById(R.id.message_text); - messageTextView.setVisibility(View.GONE); - } else { - TextView messageTextView = listItemView.requireViewById(R.id.message_text); - messageTextView.setText(currentMessage.getText()); - } - - String metrics = ""; - TextView tokensView; - if (currentMessage.getTokensPerSecond() > 0) { - metrics = String.format("%.2f", currentMessage.getTokensPerSecond()) + "t/s "; - } - - if (currentMessage.getTotalGenerationTime() > 0) { - metrics = metrics + (float) currentMessage.getTotalGenerationTime() / 1000 + "s "; - } - - if (currentMessage.getTokensPerSecond() > 0 || 
currentMessage.getTotalGenerationTime() > 0) { - tokensView = listItemView.requireViewById(R.id.generation_metrics); - tokensView.setText(metrics); - TextView separatorView = listItemView.requireViewById(R.id.bar); - separatorView.setVisibility(View.VISIBLE); - } - - if (currentMessage.getTimestamp() > 0) { - TextView timestampView = listItemView.requireViewById(R.id.timestamp); - timestampView.setText(currentMessage.getFormattedTimestamp()); - } - - return listItemView; - } - - @Override - public void add(Message msg) { - super.add(msg); - savedMessages.add(msg); - } - - @Override - public void clear() { - super.clear(); - savedMessages.clear(); - } - - public ArrayList getSavedMessages() { - return savedMessages; - } - - public ArrayList getRecentSavedTextMessages(int numOfLatestPromptMessages) { - ArrayList recentMessages = new ArrayList(); - int lastIndex = savedMessages.size() - 1; - // In most cases lastIndex >=0 . - // A situation where the user clears chat history and enters prompt. Causes lastIndex=-1 . - if (lastIndex >= 0) { - Message messageToAdd = savedMessages.get(lastIndex); - int oldPromptID = messageToAdd.getPromptID(); - - for (int i = 0; i < savedMessages.size(); i++) { - messageToAdd = savedMessages.get(lastIndex - i); - if (messageToAdd.getMessageType() != MessageType.SYSTEM) { - if (messageToAdd.getPromptID() != oldPromptID) { - numOfLatestPromptMessages--; - oldPromptID = messageToAdd.getPromptID(); - } - if (numOfLatestPromptMessages > 0) { - if (messageToAdd.getMessageType() == MessageType.TEXT) { - recentMessages.add(messageToAdd); - } - } else { - break; - } - } - } - // To place the order in [input1, output1, input2, output2...] 
- Collections.reverse(recentMessages); - } - - return recentMessages; - } - - public int getMaxPromptID() { - int maxPromptID = -1; - for (Message msg : savedMessages) { - - maxPromptID = Math.max(msg.getPromptID(), maxPromptID); - } - return maxPromptID; - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageType.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageType.java deleted file mode 100644 index 6042acb5726..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/MessageType.java +++ /dev/null @@ -1,15 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package com.example.executorchllamademo; - -public enum MessageType { - TEXT, - IMAGE, - SYSTEM -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelRunner.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelRunner.java deleted file mode 100644 index a1bc205c4ac..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelRunner.java +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -package com.example.executorchllamademo; - -import android.os.Handler; -import android.os.HandlerThread; -import android.os.Looper; -import android.os.Message; -import androidx.annotation.NonNull; -import org.json.JSONException; -import org.json.JSONObject; -import org.pytorch.executorch.extension.llm.LlmCallback; -import org.pytorch.executorch.extension.llm.LlmModule; - -/** A helper class to handle all model running logic within this class. */ -public class ModelRunner implements LlmCallback { - LlmModule mModule = null; - - String mModelFilePath = ""; - String mTokenizerFilePath = ""; - - ModelRunnerCallback mCallback = null; - - HandlerThread mHandlerThread = null; - Handler mHandler = null; - - /** - * ] Helper class to separate between UI logic and model runner logic. Automatically handle - * generate() request on worker thread. - * - * @param modelFilePath - * @param tokenizerFilePath - * @param callback - */ - ModelRunner( - String modelFilePath, - String tokenizerFilePath, - float temperature, - ModelRunnerCallback callback) { - mModelFilePath = modelFilePath; - mTokenizerFilePath = tokenizerFilePath; - mCallback = callback; - - mModule = new LlmModule(mModelFilePath, mTokenizerFilePath, 0.8f); - mHandlerThread = new HandlerThread("ModelRunner"); - mHandlerThread.start(); - mHandler = new ModelRunnerHandler(mHandlerThread.getLooper(), this); - - mHandler.sendEmptyMessage(ModelRunnerHandler.MESSAGE_LOAD_MODEL); - } - - int generate(String prompt) { - Message msg = Message.obtain(mHandler, ModelRunnerHandler.MESSAGE_GENERATE, prompt); - msg.sendToTarget(); - return 0; - } - - void stop() { - mModule.stop(); - } - - @Override - public void onResult(String result) { - mCallback.onTokenGenerated(result); - } - - @Override - public void onStats(String stats) { - float tps = 0; - try { - JSONObject jsonObject = new JSONObject(stats); - int numGeneratedTokens = jsonObject.getInt("generated_tokens"); - int inferenceEndMs = 
jsonObject.getInt("inference_end_ms"); - int promptEvalEndMs = jsonObject.getInt("prompt_eval_end_ms"); - tps = (float) numGeneratedTokens / (inferenceEndMs - promptEvalEndMs) * 1000; - } catch (JSONException e) { - } - mCallback.onStats("tokens/second: " + tps); - } -} - -class ModelRunnerHandler extends Handler { - public static int MESSAGE_LOAD_MODEL = 1; - public static int MESSAGE_GENERATE = 2; - - private final ModelRunner mModelRunner; - - public ModelRunnerHandler(Looper looper, ModelRunner modelRunner) { - super(looper); - mModelRunner = modelRunner; - } - - @Override - public void handleMessage(@NonNull android.os.Message msg) { - if (msg.what == MESSAGE_LOAD_MODEL) { - int status = mModelRunner.mModule.load(); - mModelRunner.mCallback.onModelLoaded(status); - } else if (msg.what == MESSAGE_GENERATE) { - mModelRunner.mModule.generate((String) msg.obj, mModelRunner); - mModelRunner.mCallback.onGenerationStopped(); - } - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelRunnerCallback.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelRunnerCallback.java deleted file mode 100644 index 5e8b6f00e3d..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelRunnerCallback.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package com.example.executorchllamademo; - -/** - * A helper interface within the app for MainActivity and Benchmarking to handle callback from - * ModelRunner. 
- */ -public interface ModelRunnerCallback { - - void onModelLoaded(int status); - - void onTokenGenerated(String token); - - void onStats(String stats); - - void onGenerationStopped(); -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelType.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelType.java deleted file mode 100644 index 9f8132504ea..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelType.java +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package com.example.executorchllamademo; - -public enum ModelType { - LLAMA_3, - LLAMA_3_1, - LLAMA_3_2, - LLAVA_1_5, - LLAMA_GUARD_3, - QWEN_3, -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java deleted file mode 100644 index cf7ab1756ce..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/ModelUtils.java +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -package com.example.executorchllamademo; - -public class ModelUtils { - // XNNPACK or QNN - static final int TEXT_MODEL = 1; - - // XNNPACK - static final int VISION_MODEL = 2; - static final int VISION_MODEL_IMAGE_CHANNELS = 3; - static final int VISION_MODEL_SEQ_LEN = 768; - static final int TEXT_MODEL_SEQ_LEN = 256; - - // MediaTek - static final int MEDIATEK_TEXT_MODEL = 3; - - // QNN static llama - static final int QNN_TEXT_MODEL = 4; - - public static int getModelCategory(ModelType modelType, BackendType backendType) { - if (backendType.equals(BackendType.XNNPACK)) { - switch (modelType) { - case LLAVA_1_5: - return VISION_MODEL; - case LLAMA_3: - case LLAMA_3_1: - case LLAMA_3_2: - case QWEN_3: - default: - return TEXT_MODEL; - } - } else if (backendType.equals(BackendType.MEDIATEK)) { - return MEDIATEK_TEXT_MODEL; - } else if (backendType.equals(BackendType.QUALCOMM)) { - return QNN_TEXT_MODEL; - } - - return TEXT_MODEL; // default - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java deleted file mode 100644 index 524ad7cbf6d..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/PromptFormat.java +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -package com.example.executorchllamademo; - -public class PromptFormat { - - public static final String SYSTEM_PLACEHOLDER = "{{ system_prompt }}"; - public static final String USER_PLACEHOLDER = "{{ user_prompt }}"; - public static final String ASSISTANT_PLACEHOLDER = "{{ assistant_response }}"; - public static final String THINKING_MODE_PLACEHOLDER = "{{ thinking_mode }}"; - public static final String DEFAULT_SYSTEM_PROMPT = "Answer the questions in a few sentences"; - - public static String getSystemPromptTemplate(ModelType modelType) { - switch (modelType) { - case LLAMA_3: - case LLAMA_3_1: - case LLAMA_3_2: - return "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n" - + SYSTEM_PLACEHOLDER - + "<|eot_id|>"; - case LLAVA_1_5: - return "USER: "; - case QWEN_3: - return "<|im_start|>system\n" + "You are a helpful assistant.\n" + "<|im_end|>\n"; - default: - return SYSTEM_PLACEHOLDER; - } - } - - public static String getUserPromptTemplate(ModelType modelType, boolean thinkingMode) { - switch (modelType) { - case LLAMA_3: - case LLAMA_3_1: - case LLAMA_3_2: - case LLAMA_GUARD_3: - return "<|start_header_id|>user<|end_header_id|>\n" - + USER_PLACEHOLDER - + "<|eot_id|>" - + "<|start_header_id|>assistant<|end_header_id|>"; - - case QWEN_3: - return "<|im_start|>user\n" - + USER_PLACEHOLDER - + "\n<|im_end|>\n" - + "<|im_start|>assistant\n" - + THINKING_MODE_PLACEHOLDER; - case LLAVA_1_5: - default: - return USER_PLACEHOLDER; - } - } - - public static String getConversationFormat(ModelType modelType) { - switch (modelType) { - case LLAMA_3: - case LLAMA_3_1: - case LLAMA_3_2: - return getUserPromptTemplate(modelType, false) - + "\n" - + ASSISTANT_PLACEHOLDER - + "<|eot_id|>"; - case LLAVA_1_5: - return USER_PLACEHOLDER + " ASSISTANT:"; - case QWEN_3: - return getUserPromptTemplate(modelType, false) + "<|im_end|>\n"; - default: - return USER_PLACEHOLDER; - } - } - - public static String getStopToken(ModelType modelType) { - switch (modelType) { - 
case LLAMA_3: - case LLAMA_3_1: - case LLAMA_3_2: - case LLAMA_GUARD_3: - return "<|eot_id|>"; - case LLAVA_1_5: - return ""; - case QWEN_3: - return "<|endoftext|>"; - default: - return ""; - } - } - - public static String getThinkingModeToken(ModelType modelType, boolean thinkingMode) { - switch (modelType) { - case QWEN_3: - return thinkingMode ? "" : "\n\n\n\n\n"; - default: - return ""; - } - } - - public static String replaceSpecialToken(ModelType modelType, String token) { - switch (modelType) { - case QWEN_3: - switch (token) { - case "<|im_end|>": - return ""; - case "": - return "Thinking...\n"; - case "": - return "\nDone thinking"; - } - default: - return token; - } - } - - public static String getLlavaPresetPrompt() { - return "A chat between a curious human and an artificial intelligence assistant. The assistant" - + " gives helpful, detailed, and polite answers to the human's questions. USER: "; - } - - public static String getFormattedLlamaGuardPrompt(String userPrompt) { - return getUserPromptTemplate(ModelType.LLAMA_GUARD_3, false) - .replace( - USER_PLACEHOLDER, getLlamaGuardPresetPrompt().replace(USER_PLACEHOLDER, userPrompt)); - } - - public static String getLlamaGuardPresetPrompt() { - return "Task: Check if there is unsafe content in 'User' messages in conversations" - + " according our safety policy with the below categories.\n" - + "\n" - + "\n" - + "S1: Violent Crimes. \n" - + "S2: Non-Violent Crimes. \n" - + "S3: Sex Crimes. \n" - + "S4: Child Exploitation. \n" - + "S5: Defamation. \n" - + "S6: Specialized Advice. \n" - + "S7: Privacy. \n" - + "S8: Intellectual Property. \n" - + "S9: Indiscriminate Weapons. \n" - + "S10: Hate. \n" - + "S11: Self-Harm. \n" - + "S12: Sexual Content. \n" - + "S13: Elections. 
\n" - + "S14: Code Interpreter Abuse.\n" - + "\n" - + "\n" - + "\n" - + "\n" - + "User: " - + USER_PLACEHOLDER - + "\n" - + "\n" - + "Provide your safety assessment for ONLY THE LAST User message in the above" - + " conversation:\n" - + " - First line must read 'safe' or 'unsafe'.\n" - + " - If unsafe, a second line must include a comma-separated list of violated" - + " categories."; - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java deleted file mode 100644 index 0e388a5b0a4..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsActivity.java +++ /dev/null @@ -1,463 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -package com.example.executorchllamademo; - -import android.app.AlertDialog; -import android.content.DialogInterface; -import android.os.Build; -import android.os.Bundle; -import android.text.Editable; -import android.text.TextWatcher; -import android.view.View; -import android.widget.Button; -import android.widget.EditText; -import android.widget.ImageButton; -import android.widget.TextView; -import androidx.appcompat.app.AppCompatActivity; -import androidx.core.content.ContextCompat; -import androidx.core.graphics.Insets; -import androidx.core.view.ViewCompat; -import androidx.core.view.WindowInsetsCompat; -import com.google.gson.Gson; -import java.io.File; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -public class SettingsActivity extends AppCompatActivity { - - private String mModelFilePath = ""; - private String mTokenizerFilePath = ""; - private TextView mBackendTextView; - private TextView mModelTextView; - private TextView mTokenizerTextView; - private TextView mModelTypeTextView; - private EditText mSystemPromptEditText; - private EditText mUserPromptEditText; - private Button mLoadModelButton; - private double mSetTemperature; - private String mSystemPrompt; - private String mUserPrompt; - private BackendType mBackendType; - private ModelType mModelType; - public SettingsFields mSettingsFields; - - private DemoSharedPreferences mDemoSharedPreferences; - public static double TEMPERATURE_MIN_VALUE = 0.0; - - @Override - protected void onCreate(Bundle savedInstanceState) { - super.onCreate(savedInstanceState); - setContentView(R.layout.activity_settings); - if (Build.VERSION.SDK_INT >= 21) { - getWindow().setStatusBarColor(ContextCompat.getColor(this, R.color.status_bar)); - getWindow().setNavigationBarColor(ContextCompat.getColor(this, R.color.nav_bar)); - } - ViewCompat.setOnApplyWindowInsetsListener( - requireViewById(R.id.main), - (v, insets) -> { - Insets systemBars = 
insets.getInsets(WindowInsetsCompat.Type.systemBars()); - v.setPadding(systemBars.left, systemBars.top, systemBars.right, systemBars.bottom); - return insets; - }); - mDemoSharedPreferences = new DemoSharedPreferences(getBaseContext()); - mSettingsFields = new SettingsFields(); - setupSettings(); - } - - private void setupSettings() { - mBackendTextView = requireViewById(R.id.backendTextView); - mModelTextView = requireViewById(R.id.modelTextView); - mTokenizerTextView = requireViewById(R.id.tokenizerTextView); - mModelTypeTextView = requireViewById(R.id.modelTypeTextView); - ImageButton backendImageButton = requireViewById(R.id.backendImageButton); - ImageButton modelImageButton = requireViewById(R.id.modelImageButton); - ImageButton tokenizerImageButton = requireViewById(R.id.tokenizerImageButton); - ImageButton modelTypeImageButton = requireViewById(R.id.modelTypeImageButton); - mSystemPromptEditText = requireViewById(R.id.systemPromptText); - mUserPromptEditText = requireViewById(R.id.userPromptText); - loadSettings(); - - // TODO: The two setOnClickListeners will be removed after file path issue is resolved - backendImageButton.setOnClickListener( - view -> { - setupBackendSelectorDialog(); - }); - modelImageButton.setOnClickListener( - view -> { - setupModelSelectorDialog(); - }); - tokenizerImageButton.setOnClickListener( - view -> { - setupTokenizerSelectorDialog(); - }); - modelTypeImageButton.setOnClickListener( - view -> { - setupModelTypeSelectorDialog(); - }); - mModelFilePath = mSettingsFields.getModelFilePath(); - if (!mModelFilePath.isEmpty()) { - mModelTextView.setText(getFilenameFromPath(mModelFilePath)); - } - mTokenizerFilePath = mSettingsFields.getTokenizerFilePath(); - if (!mTokenizerFilePath.isEmpty()) { - mTokenizerTextView.setText(getFilenameFromPath(mTokenizerFilePath)); - } - mModelType = mSettingsFields.getModelType(); - ETLogging.getInstance().log("mModelType from settings " + mModelType); - if (mModelType != null) { - 
mModelTypeTextView.setText(mModelType.toString()); - } - mBackendType = mSettingsFields.getBackendType(); - ETLogging.getInstance().log("mBackendType from settings " + mBackendType); - if (mBackendType != null) { - mBackendTextView.setText(mBackendType.toString()); - setBackendSettingMode(); - } - - setupParameterSettings(); - setupPromptSettings(); - setupClearChatHistoryButton(); - setupLoadModelButton(); - } - - private void setupLoadModelButton() { - mLoadModelButton = requireViewById(R.id.loadModelButton); - mLoadModelButton.setEnabled(true); - mLoadModelButton.setOnClickListener( - view -> { - new AlertDialog.Builder(this) - .setTitle("Load Model") - .setMessage("Do you really want to load the new model?") - .setIcon(android.R.drawable.ic_dialog_alert) - .setPositiveButton( - android.R.string.yes, - new DialogInterface.OnClickListener() { - public void onClick(DialogInterface dialog, int whichButton) { - mSettingsFields.saveLoadModelAction(true); - mLoadModelButton.setEnabled(false); - onBackPressed(); - } - }) - .setNegativeButton(android.R.string.no, null) - .show(); - }); - } - - private void setupClearChatHistoryButton() { - Button clearChatButton = requireViewById(R.id.clearChatButton); - clearChatButton.setOnClickListener( - view -> { - new AlertDialog.Builder(this) - .setTitle("Delete Chat History") - .setMessage("Do you really want to delete chat history?") - .setIcon(android.R.drawable.ic_dialog_alert) - .setPositiveButton( - android.R.string.yes, - new DialogInterface.OnClickListener() { - public void onClick(DialogInterface dialog, int whichButton) { - mSettingsFields.saveIsClearChatHistory(true); - } - }) - .setNegativeButton(android.R.string.no, null) - .show(); - }); - } - - private void setupParameterSettings() { - setupTemperatureSettings(); - } - - private void setupTemperatureSettings() { - mSetTemperature = mSettingsFields.getTemperature(); - EditText temperatureEditText = requireViewById(R.id.temperatureEditText); - 
temperatureEditText.setText(String.valueOf(mSetTemperature)); - temperatureEditText.addTextChangedListener( - new TextWatcher() { - @Override - public void beforeTextChanged(CharSequence s, int start, int count, int after) {} - - @Override - public void onTextChanged(CharSequence s, int start, int before, int count) {} - - @Override - public void afterTextChanged(Editable s) { - mSetTemperature = Double.parseDouble(s.toString()); - // This is needed because temperature is changed together with model loading - // Once temperature is no longer in LlmModule constructor, we can remove this - mSettingsFields.saveLoadModelAction(true); - saveSettings(); - } - }); - } - - private void setupPromptSettings() { - setupSystemPromptSettings(); - setupUserPromptSettings(); - } - - private void setupSystemPromptSettings() { - mSystemPrompt = mSettingsFields.getSystemPrompt(); - mSystemPromptEditText.setText(mSystemPrompt); - mSystemPromptEditText.addTextChangedListener( - new TextWatcher() { - @Override - public void beforeTextChanged(CharSequence s, int start, int count, int after) {} - - @Override - public void onTextChanged(CharSequence s, int start, int before, int count) {} - - @Override - public void afterTextChanged(Editable s) { - mSystemPrompt = s.toString(); - } - }); - - ImageButton resetSystemPrompt = requireViewById(R.id.resetSystemPrompt); - resetSystemPrompt.setOnClickListener( - view -> { - new AlertDialog.Builder(this) - .setTitle("Reset System Prompt") - .setMessage("Do you really want to reset system prompt?") - .setIcon(android.R.drawable.ic_dialog_alert) - .setPositiveButton( - android.R.string.yes, - new DialogInterface.OnClickListener() { - public void onClick(DialogInterface dialog, int whichButton) { - // Clear the messageAdapter and sharedPreference - mSystemPromptEditText.setText(PromptFormat.DEFAULT_SYSTEM_PROMPT); - } - }) - .setNegativeButton(android.R.string.no, null) - .show(); - }); - } - - private void setupUserPromptSettings() { - mUserPrompt = 
mSettingsFields.getUserPrompt(); - mUserPromptEditText.setText(mUserPrompt); - mUserPromptEditText.addTextChangedListener( - new TextWatcher() { - @Override - public void beforeTextChanged(CharSequence s, int start, int count, int after) {} - - @Override - public void onTextChanged(CharSequence s, int start, int before, int count) {} - - @Override - public void afterTextChanged(Editable s) { - if (isValidUserPrompt(s.toString())) { - mUserPrompt = s.toString(); - } else { - showInvalidPromptDialog(); - } - } - }); - - ImageButton resetUserPrompt = requireViewById(R.id.resetUserPrompt); - resetUserPrompt.setOnClickListener( - view -> { - new AlertDialog.Builder(this) - .setTitle("Reset Prompt Template") - .setMessage("Do you really want to reset the prompt template?") - .setIcon(android.R.drawable.ic_dialog_alert) - .setPositiveButton( - android.R.string.yes, - new DialogInterface.OnClickListener() { - public void onClick(DialogInterface dialog, int whichButton) { - // Clear the messageAdapter and sharedPreference - mUserPromptEditText.setText( - PromptFormat.getUserPromptTemplate(mModelType, false)); - } - }) - .setNegativeButton(android.R.string.no, null) - .show(); - }); - } - - private boolean isValidUserPrompt(String userPrompt) { - return userPrompt.contains(PromptFormat.USER_PLACEHOLDER); - } - - private void showInvalidPromptDialog() { - new AlertDialog.Builder(this) - .setTitle("Invalid Prompt Format") - .setMessage( - "Prompt format must contain " - + PromptFormat.USER_PLACEHOLDER - + ". 
Do you want to reset prompt format?") - .setIcon(android.R.drawable.ic_dialog_alert) - .setPositiveButton( - android.R.string.yes, - (dialog, whichButton) -> { - mUserPromptEditText.setText(PromptFormat.getUserPromptTemplate(mModelType, false)); - }) - .setNegativeButton(android.R.string.no, null) - .show(); - } - - private void setupBackendSelectorDialog() { - // Convert enum to list - List backendTypesList = new ArrayList<>(); - for (BackendType backendType : BackendType.values()) { - backendTypesList.add(backendType.toString()); - } - // Alert dialog builder takes in arr of string instead of list - String[] backendTypes = backendTypesList.toArray(new String[0]); - AlertDialog.Builder backendTypeBuilder = new AlertDialog.Builder(this); - backendTypeBuilder.setTitle("Select backend type"); - backendTypeBuilder.setSingleChoiceItems( - backendTypes, - -1, - (dialog, item) -> { - mBackendTextView.setText(backendTypes[item]); - mBackendType = BackendType.valueOf(backendTypes[item]); - setBackendSettingMode(); - dialog.dismiss(); - }); - - backendTypeBuilder.create().show(); - } - - private void setupModelSelectorDialog() { - String[] pteFiles = listLocalFile("/data/local/tmp/llama/", new String[] {".pte"}); - AlertDialog.Builder modelPathBuilder = new AlertDialog.Builder(this); - modelPathBuilder.setTitle("Select model path"); - - modelPathBuilder.setSingleChoiceItems( - pteFiles, - -1, - (dialog, item) -> { - mModelFilePath = pteFiles[item]; - mModelTextView.setText(getFilenameFromPath(mModelFilePath)); - mLoadModelButton.setEnabled(true); - dialog.dismiss(); - }); - - modelPathBuilder.create().show(); - } - - private static boolean fileHasExtension(String file, String[] suffix) { - return Arrays.stream(suffix).anyMatch(entry -> file.endsWith(entry)); - } - - private static String[] listLocalFile(String path, String[] suffix) { - File directory = new File(path); - if (directory.exists() && directory.isDirectory()) { - File[] files = directory.listFiles((dir, name) -> 
(fileHasExtension(name, suffix))); - String[] result = new String[files.length]; - for (int i = 0; i < files.length; i++) { - if (files[i].isFile() && fileHasExtension(files[i].getName(), suffix)) { - result[i] = files[i].getAbsolutePath(); - } - } - return result; - } - return new String[] {}; - } - - private void setupModelTypeSelectorDialog() { - // Convert enum to list - List modelTypesList = new ArrayList<>(); - for (ModelType modelType : ModelType.values()) { - modelTypesList.add(modelType.toString()); - } - // Alert dialog builder takes in arr of string instead of list - String[] modelTypes = modelTypesList.toArray(new String[0]); - AlertDialog.Builder modelTypeBuilder = new AlertDialog.Builder(this); - modelTypeBuilder.setTitle("Select model type"); - modelTypeBuilder.setSingleChoiceItems( - modelTypes, - -1, - (dialog, item) -> { - mModelTypeTextView.setText(modelTypes[item]); - mModelType = ModelType.valueOf(modelTypes[item]); - mUserPromptEditText.setText(PromptFormat.getUserPromptTemplate(mModelType, false)); - dialog.dismiss(); - }); - - modelTypeBuilder.create().show(); - } - - private void setupTokenizerSelectorDialog() { - String[] tokenizerFiles = - listLocalFile("/data/local/tmp/llama/", new String[] {".bin", ".json", ".model"}); - AlertDialog.Builder tokenizerPathBuilder = new AlertDialog.Builder(this); - tokenizerPathBuilder.setTitle("Select tokenizer path"); - tokenizerPathBuilder.setSingleChoiceItems( - tokenizerFiles, - -1, - (dialog, item) -> { - mTokenizerFilePath = tokenizerFiles[item]; - mTokenizerTextView.setText(getFilenameFromPath(mTokenizerFilePath)); - mLoadModelButton.setEnabled(true); - dialog.dismiss(); - }); - - tokenizerPathBuilder.create().show(); - } - - private String getFilenameFromPath(String uriFilePath) { - String[] segments = uriFilePath.split("/"); - if (segments.length > 0) { - return segments[segments.length - 1]; // get last element (aka filename) - } - return ""; - } - - private void setBackendSettingMode() { - if 
(mBackendType.equals(BackendType.XNNPACK) || mBackendType.equals(BackendType.QUALCOMM)) { - setXNNPACKSettingMode(); - } else if (mBackendType.equals(BackendType.MEDIATEK)) { - setMediaTekSettingMode(); - } - } - - private void setXNNPACKSettingMode() { - requireViewById(R.id.modelLayout).setVisibility(View.VISIBLE); - requireViewById(R.id.tokenizerLayout).setVisibility(View.VISIBLE); - requireViewById(R.id.parametersView).setVisibility(View.VISIBLE); - requireViewById(R.id.temperatureLayout).setVisibility(View.VISIBLE); - mModelFilePath = ""; - mTokenizerFilePath = ""; - } - - private void setMediaTekSettingMode() { - requireViewById(R.id.modelLayout).setVisibility(View.GONE); - requireViewById(R.id.tokenizerLayout).setVisibility(View.GONE); - requireViewById(R.id.parametersView).setVisibility(View.GONE); - requireViewById(R.id.temperatureLayout).setVisibility(View.GONE); - mModelFilePath = "/in/mtk/llama/runner"; - mTokenizerFilePath = "/in/mtk/llama/runner"; - } - - private void loadSettings() { - Gson gson = new Gson(); - String settingsFieldsJSON = mDemoSharedPreferences.getSettings(); - if (!settingsFieldsJSON.isEmpty()) { - mSettingsFields = gson.fromJson(settingsFieldsJSON, SettingsFields.class); - } - } - - private void saveSettings() { - mSettingsFields.saveModelPath(mModelFilePath); - mSettingsFields.saveTokenizerPath(mTokenizerFilePath); - mSettingsFields.saveParameters(mSetTemperature); - mSettingsFields.savePrompts(mSystemPrompt, mUserPrompt); - mSettingsFields.saveModelType(mModelType); - mSettingsFields.saveBackendType(mBackendType); - mDemoSharedPreferences.addSettings(mSettingsFields); - } - - @Override - public void onBackPressed() { - super.onBackPressed(); - saveSettings(); - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java b/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java deleted file mode 100644 index 
94036f43947..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/SettingsFields.java +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package com.example.executorchllamademo; - -public class SettingsFields { - - public String getModelFilePath() { - return modelFilePath; - } - - public String getTokenizerFilePath() { - return tokenizerFilePath; - } - - public double getTemperature() { - return temperature; - } - - public String getSystemPrompt() { - return systemPrompt; - } - - public ModelType getModelType() { - return modelType; - } - - public BackendType getBackendType() { - return backendType; - } - - public String getUserPrompt() { - return userPrompt; - } - - public String getFormattedSystemAndUserPrompt(String prompt, boolean thinkingMode) { - return getFormattedSystemPrompt() + getFormattedUserPrompt(prompt, thinkingMode); - } - - public String getFormattedSystemPrompt() { - return PromptFormat.getSystemPromptTemplate(modelType) - .replace(PromptFormat.SYSTEM_PLACEHOLDER, systemPrompt); - } - - public String getFormattedUserPrompt(String prompt, boolean thinkingMode) { - return userPrompt - .replace(PromptFormat.USER_PLACEHOLDER, prompt) - .replace( - PromptFormat.THINKING_MODE_PLACEHOLDER, - PromptFormat.getThinkingModeToken(modelType, thinkingMode)); - } - - public boolean getIsClearChatHistory() { - return isClearChatHistory; - } - - public boolean getIsLoadModel() { - return isLoadModel; - } - - private String modelFilePath; - private String tokenizerFilePath; - private double temperature; - private String systemPrompt; - private String userPrompt; - private boolean isClearChatHistory; - private boolean isLoadModel; - private ModelType modelType; - private BackendType backendType; - - public 
SettingsFields() { - ModelType DEFAULT_MODEL = ModelType.LLAMA_3; - BackendType DEFAULT_BACKEND = BackendType.XNNPACK; - - modelFilePath = ""; - tokenizerFilePath = ""; - temperature = SettingsActivity.TEMPERATURE_MIN_VALUE; - systemPrompt = ""; - userPrompt = PromptFormat.getUserPromptTemplate(DEFAULT_MODEL, false); - isClearChatHistory = false; - isLoadModel = false; - modelType = DEFAULT_MODEL; - backendType = DEFAULT_BACKEND; - } - - public SettingsFields(SettingsFields settingsFields) { - this.modelFilePath = settingsFields.modelFilePath; - this.tokenizerFilePath = settingsFields.tokenizerFilePath; - this.temperature = settingsFields.temperature; - this.systemPrompt = settingsFields.getSystemPrompt(); - this.userPrompt = settingsFields.getUserPrompt(); - this.isClearChatHistory = settingsFields.getIsClearChatHistory(); - this.isLoadModel = settingsFields.getIsLoadModel(); - this.modelType = settingsFields.modelType; - this.backendType = settingsFields.backendType; - } - - public void saveModelPath(String modelFilePath) { - this.modelFilePath = modelFilePath; - } - - public void saveTokenizerPath(String tokenizerFilePath) { - this.tokenizerFilePath = tokenizerFilePath; - } - - public void saveModelType(ModelType modelType) { - this.modelType = modelType; - } - - public void saveBackendType(BackendType backendType) { - this.backendType = backendType; - } - - public void saveParameters(Double temperature) { - this.temperature = temperature; - } - - public void savePrompts(String systemPrompt, String userPrompt) { - this.systemPrompt = systemPrompt; - this.userPrompt = userPrompt; - } - - public void saveIsClearChatHistory(boolean needToClear) { - this.isClearChatHistory = needToClear; - } - - public void saveLoadModelAction(boolean shouldLoadModel) { - this.isLoadModel = shouldLoadModel; - } - - public boolean equals(SettingsFields anotherSettingsFields) { - if (this == anotherSettingsFields) return true; - return 
modelFilePath.equals(anotherSettingsFields.modelFilePath) - && tokenizerFilePath.equals(anotherSettingsFields.tokenizerFilePath) - && temperature == anotherSettingsFields.temperature - && systemPrompt.equals(anotherSettingsFields.systemPrompt) - && userPrompt.equals(anotherSettingsFields.userPrompt) - && isClearChatHistory == anotherSettingsFields.isClearChatHistory - && isLoadModel == anotherSettingsFields.isLoadModel - && modelType == anotherSettingsFields.modelType - && backendType == anotherSettingsFields.backendType; - } -} diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/banner_shape.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/banner_shape.xml deleted file mode 100644 index 0868ffffa6f..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/banner_shape.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - - \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_24.xml deleted file mode 100644 index 2ae27b8409e..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_24.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_photo_alternate_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_photo_alternate_24.xml deleted file mode 100644 index 7077fedd483..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_add_photo_alternate_24.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_article_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_article_24.xml deleted file mode 100644 index a6837b9c69f..00000000000 --- 
a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_article_24.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_close_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_close_24.xml deleted file mode 100644 index fb902d4331b..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_close_24.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_delete_forever_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_delete_forever_24.xml deleted file mode 100644 index 4680bc6629e..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_delete_forever_24.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_lightbulb_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_lightbulb_24.xml deleted file mode 100644 index aa045396d28..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_lightbulb_24.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_restart_alt_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_restart_alt_24.xml deleted file mode 100644 index 860470ab109..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_restart_alt_24.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_send_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_send_24.xml deleted file mode 100644 index 2de1f642089..00000000000 --- 
a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_send_24.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_settings_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_settings_24.xml deleted file mode 100644 index c51d84b9f4f..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_settings_24.xml +++ /dev/null @@ -1,11 +0,0 @@ - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_stop_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_stop_24.xml deleted file mode 100644 index 832e2585954..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/baseline_stop_24.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/blue_lightbulb_24.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/blue_lightbulb_24.xml deleted file mode 100644 index 585cd3b1892..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/blue_lightbulb_24.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/btn.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/btn.xml deleted file mode 100644 index ceb3ac56c9e..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/btn.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/chat_background.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/chat_background.xml deleted file mode 100644 index eb8b9d1f1a9..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/chat_background.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - diff --git 
a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/custom_button_round.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/custom_button_round.xml deleted file mode 100644 index 87c82d2a38d..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/custom_button_round.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/expand_circle_down.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/expand_circle_down.xml deleted file mode 100644 index 0a7a71f0700..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/expand_circle_down.xml +++ /dev/null @@ -1,9 +0,0 @@ - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/ic_launcher_background.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/ic_launcher_background.xml deleted file mode 100644 index 07d5da9cbf1..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/ic_launcher_background.xml +++ /dev/null @@ -1,170 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/ic_launcher_foreground.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/ic_launcher_foreground.xml deleted file mode 100644 index 7706ab9e6d4..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/ic_launcher_foreground.xml +++ /dev/null @@ -1,30 +0,0 @@ - - - - - - - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/input_text_shape.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/input_text_shape.xml deleted file mode 100644 index 35c778a437d..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/input_text_shape.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - \ No 
newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/logo.png b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/logo.png deleted file mode 100644 index 60e3e5174e9..00000000000 Binary files a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/logo.png and /dev/null differ diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_add_box_48.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_add_box_48.xml deleted file mode 100644 index bb45d63d85b..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_add_box_48.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_camera_alt_48.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_camera_alt_48.xml deleted file mode 100644 index c7b4b2e4a1d..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_camera_alt_48.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_image_48.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_image_48.xml deleted file mode 100644 index a8bb4b2f646..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/outline_image_48.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/prompt_shape.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/prompt_shape.xml deleted file mode 100644 index 5f81396e382..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/prompt_shape.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/received_message.xml 
b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/received_message.xml deleted file mode 100644 index c2288b5bfce..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/received_message.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/sent_message.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/sent_message.xml deleted file mode 100644 index e8d13ca4e12..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/sent_message.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/three_dots.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/three_dots.xml deleted file mode 100644 index afbe22da808..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/drawable/three_dots.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_benchmarking.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_benchmarking.xml deleted file mode 100644 index 6e48b5de8be..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_benchmarking.xml +++ /dev/null @@ -1,16 +0,0 @@ - - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_logs.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_logs.xml deleted file mode 100644 index b327a544f25..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_logs.xml +++ /dev/null @@ -1,55 +0,0 @@ - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml deleted file mode 100644 index 52bf533521a..00000000000 --- 
a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_main.xml +++ /dev/null @@ -1,241 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_settings.xml b/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_settings.xml deleted file mode 100644 index 0ec551ae364..00000000000 --- a/examples/demo-apps/android/LlamaDemo/app/src/main/res/layout/activity_settings.xml +++ /dev/null @@ -1,338 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -